diff --git a/examples/offline_inference/ming_flash_omni/README.md b/examples/offline_inference/ming_flash_omni/README.md
new file mode 100644
index 00000000000..7414163fc01
--- /dev/null
+++ b/examples/offline_inference/ming_flash_omni/README.md
@@ -0,0 +1,76 @@
+# Ming-flash-omni 2.0
+
+[Ming-flash-omni-2.0](https://github.com/inclusionAI/Ming) is an omni-modal model supporting text, image, video, and audio understanding, with outputs in text, image, and audio. For now, vLLM-Omni supports only the thinker stage (multi-modal understanding) of Ming-flash-omni-2.0.
+
+## Setup
+
+Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup.
+
+## Run examples
+
+### Text-only
+```bash
+python examples/offline_inference/ming_flash_omni/end2end.py --query-type text
+```
+
+#### Reasoning (Thinking Mode)
+
+Reasoning (Thinking) mode is enabled by applying "detailed thinking on" when building the system prompt template (in `apply_chat_template`).
+
+In the end2end example, a default problem for thinking mode is provided, taken from the example usage in Ming's cookbook.
+To use it, download the example figure from https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/figures/cases/3_0.png
+
+```bash
+python examples/offline_inference/ming_flash_omni/end2end.py -q reasoning --image-path ./3_0.png
+```
+
+### Image understanding
+```bash
+python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_image
+
+# With a local image
+python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_image --image-path /path/to/image.jpg
+```
+
+### Audio understanding
+```bash
+python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_audio
+
+# With a local audio file
+python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_audio --audio-path /path/to/audio.wav
+```
+
+### Video understanding
+```bash
+python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_video
+
+# With a local video and custom frame count
+python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_video --video-path /path/to/video.mp4 --num-frames 16
+```
+
+### Mixed modalities (image + audio)
+```bash
+python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_mixed_modalities \
+    --image-path /path/to/image.jpg \
+    --audio-path /path/to/audio.wav
+```
+
+If media file paths are not provided, the script uses built-in default assets.
+
+### Modality control
+To control output modalities (e.g. text-only output):
+```bash
+python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_audio --modalities text
+```
+
+*For now, only text output is supported*
+
+### Custom stage config
+```bash
+python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_image \
+    --stage-configs-path /path/to/your_config.yaml
+```
+
+## Online serving
+
+For online serving via the OpenAI-compatible API, see [examples/online_serving/ming_flash_omni/README.md](../../online_serving/ming_flash_omni/README.md).
diff --git a/examples/offline_inference/ming_flash_omni/end2end.py b/examples/offline_inference/ming_flash_omni/end2end.py
new file mode 100644
index 00000000000..49cdbcc0186
--- /dev/null
+++ b/examples/offline_inference/ming_flash_omni/end2end.py
@@ -0,0 +1,485 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Some of the example cases are adapted from
+# https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/cookbook.ipynb
+import os
+import time
+from typing import NamedTuple
+
+import librosa
+import numpy as np
+import vllm
+from PIL import Image
+from transformers import AutoProcessor
+from vllm import SamplingParams
+from vllm.assets.audio import AudioAsset
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset, video_to_ndarrays
+from vllm.multimodal.image import convert_image_mode
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+import vllm_omni
+from vllm_omni.entrypoints.omni import Omni
+
+# Importing the processor also registers it
+from vllm_omni.transformers_utils.processors.ming import MingFlashOmniProcessor  # noqa: F401
+
+SEED = 42
+MODEL_NAME = "Jonathan1909/Ming-flash-omni-2.0"
+
+
+class QueryResult(NamedTuple):  # one prepared query, as returned by the get_*_query builders in this file
+    inputs: dict  # prompt payload: always "prompt", plus "multi_modal_data" for media queries
+    limit_mm_per_prompt: dict[str, int]  # modality name -> number of items, e.g. {"image": 1}; {} for text-only
+
+
+def get_text_query(processor: MingFlashOmniProcessor, question: str | None = None) -> QueryResult:
+    """Build a text-only query; a built-in Chinese question is used when ``question`` is None."""
+    user_text = "请详细介绍鹦鹉的生活习性。" if question is None else question
+    chat = [{"role": "HUMAN", "content": user_text}]
+    rendered = processor.apply_chat_template(chat, tokenize=False)
+    return QueryResult(
+        inputs={"prompt": rendered},
+        limit_mm_per_prompt={},
+    )
+
+
+def get_image_query(
+    processor: MingFlashOmniProcessor,
+    question: str | None = None,
+    image_path: str | None = None,
+) -> QueryResult:
+    """Build a single-image query.
+
+    A given ``image_path`` is opened with PIL (``FileNotFoundError`` if it does not
+    exist); otherwise the ``cherry_blossom`` asset is used. Either way the image is
+    converted to RGB before it is templated and attached.
+    """
+    user_text = "Describe this image in detail." if question is None else question
+
+    if not image_path:
+        source_image = ImageAsset("cherry_blossom").pil_image
+    elif os.path.exists(image_path):
+        source_image = Image.open(image_path)
+    else:
+        raise FileNotFoundError(f"Image file not found: {image_path}")
+    rgb_image = convert_image_mode(source_image, "RGB")
+
+    image_part = {"type": "image", "image": rgb_image}
+    text_part = {"type": "text", "text": user_text}
+    chat = [{"role": "HUMAN", "content": [image_part, text_part]}]
+    rendered = processor.apply_chat_template(chat, tokenize=False)
+
+    return QueryResult(
+        inputs={
+            "prompt": rendered,
+            "multi_modal_data": {"image": rgb_image},
+        },
+        limit_mm_per_prompt={"image": 1},
+    )
+
+
+def get_audio_query(
+    processor: MingFlashOmniProcessor,
+    question: str | None = None,
+    audio_path: str | None = None,
+    sampling_rate: int = 16000,
+) -> QueryResult:
+    """Build a single-audio query.
+
+    A given ``audio_path`` is loaded with librosa at ``sampling_rate`` and cast to
+    float32 (``FileNotFoundError`` if the path does not exist). Without a path the
+    ``mary_had_lamb`` asset is used as-is; ``sampling_rate`` is not applied to it.
+    """
+    default_question = "Please recognize the language of this speech and transcribe it. Format: oral."
+    user_text = default_question if question is None else question
+
+    if not audio_path:
+        audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate
+    elif os.path.exists(audio_path):
+        waveform, loaded_rate = librosa.load(audio_path, sr=sampling_rate)
+        audio_data = (waveform.astype(np.float32), loaded_rate)
+    else:
+        raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+    # The "audio" entry is a placeholder string so the processor counts it as 1 audio input
+    audio_part = {"type": "audio", "audio": "input"}
+    text_part = {"type": "text", "text": user_text}
+    chat = [{"role": "HUMAN", "content": [audio_part, text_part]}]
+    rendered = processor.apply_chat_template(chat, tokenize=False)
+
+    return QueryResult(
+        inputs={
+            "prompt": rendered,
+            "multi_modal_data": {"audio": audio_data},
+        },
+        limit_mm_per_prompt={"audio": 1},
+    )
+
+
+def get_video_query(
+    processor: MingFlashOmniProcessor,
+    question: str | None = None,
+    video_path: str | None = None,
+    num_frames: int = 16,
+) -> QueryResult:
+    """Build a single-video query.
+
+    Frames are read from ``video_path`` when given (``FileNotFoundError`` if it
+    does not exist), otherwise taken from the ``baby_reading`` video asset.
+    ``num_frames`` is forwarded to the frame loader in both cases.
+    """
+    user_text = "Describe what is happening in this video." if question is None else question
+
+    if not video_path:
+        frames = VideoAsset(name="baby_reading", num_frames=num_frames).np_ndarrays
+    elif os.path.exists(video_path):
+        frames = video_to_ndarrays(video_path, num_frames=num_frames)
+    else:
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+
+    # The video part carries no payload here; the frames are attached via "multi_modal_data".
+    video_part = {"type": "video"}
+    text_part = {"type": "text", "text": user_text}
+    chat = [{"role": "HUMAN", "content": [video_part, text_part]}]
+    rendered = processor.apply_chat_template(chat, tokenize=False)
+
+    return QueryResult(
+        inputs={
+            "prompt": rendered,
+            "multi_modal_data": {"video": frames},
+        },
+        limit_mm_per_prompt={"video": 1},
+    )
+
+
+def get_mixed_modalities_query(
+    processor: MingFlashOmniProcessor,
+    image_path: str | None = None,
+    audio_path: str | None = None,
+    sampling_rate: int = 16000,
+) -> QueryResult:
+    """Build a combined image + audio understanding query.
+
+    Each medium is loaded from its path when given (``FileNotFoundError`` if the path
+    does not exist), otherwise from the ``cherry_blossom`` / ``mary_had_lamb`` asset.
+    """
+    question = "Describe the image, and recognize the language of this speech and transcribe it. Format: oral"
+
+    if not image_path:
+        source_image = ImageAsset("cherry_blossom").pil_image
+    elif os.path.exists(image_path):
+        source_image = Image.open(image_path)
+    else:
+        raise FileNotFoundError(f"Image file not found: {image_path}")
+    rgb_image = convert_image_mode(source_image, "RGB")
+
+    if not audio_path:
+        audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate
+    elif os.path.exists(audio_path):
+        waveform, loaded_rate = librosa.load(audio_path, sr=sampling_rate)
+        audio_data = (waveform.astype(np.float32), loaded_rate)
+    else:
+        raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+    # Content order is image, audio, then the text instruction.
+    image_part = {"type": "image", "image": rgb_image}
+    audio_part = {"type": "audio", "audio": "input"}
+    text_part = {"type": "text", "text": question}
+    chat = [{"role": "HUMAN", "content": [image_part, audio_part, text_part]}]
+    rendered = processor.apply_chat_template(chat, tokenize=False)
+
+    return QueryResult(
+        inputs={
+            "prompt": rendered,
+            "multi_modal_data": {"image": rgb_image, "audio": audio_data},
+        },
+        limit_mm_per_prompt={"image": 1, "audio": 1},
+    )
+
+
+def get_reasoning_query(
+    processor: MingFlashOmniProcessor,
+    question: str | None = None,
+    image_path: str | None = None,
+) -> QueryResult:
+    """Build a reasoning ("thinking mode") query, optionally grounded on an image.
+
+    The chat template is rendered with ``use_cot_system_prompt=True``. When
+    ``image_path`` is given the image is attached (``FileNotFoundError`` if the
+    path does not exist); otherwise the query is text-only. The default question
+    refers to a specific example figure, so it is only meaningful together with
+    that image.
+    """
+    if question is None:
+        # NOTE: To use the following default question, input with example figure provided by Ming
+        # https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/figures/cases/3_0.png
+        # E.g.,
+        # python examples/offline_inference/ming_flash_omni/end2end.py -q reasoning --image-path ./3_0.png
+        # Otherwise, the problem solving might be false.
+        question = (
+            "Based on the following rules:\n•\tYou control the smiley face character\n"
+            "•\tYou can move up, down, left, and right, and only a single square at a time\n"
+            "•\tWalls are dark grey and cannot be moved into\n•\tThe brown square is a box\n•"
+            "\tThe box can be pushed by moving into it (i.e., if you are in the square "
+            "adjacent to the box to the left, and move onto the square with the box, "
+            "the box will move one square to the right).\n"
+            "•\tThe box cannot be pushed into walls\n"
+            "•\tThe blue door at the bottom is locked and cannot be passed through, "
+            "unless the box is placed on the blue square\n"
+            "•\tThe square beneath the blue door is the exit\n"
+            "•\tMoving from one square to another\n\n"
+            "Let's assume a coordinate system where the smiley face is "
+            "on the top left at (1,1) and the square below it is (1,2). "
+            "The smiley face performs the following moves: {down, right, right, right}, "
+            "such that the smiley face is at square (4,2) and the box is in square (5,2). "
+            "What are the next sequence of moves that must be done to move the box down to (5,3)? "
+            "Give your answer as a comma separated list."
+        )
+
+    if not image_path:
+        # Text-only reasoning: the question is passed as a plain string.
+        chat = [{"role": "HUMAN", "content": question}]
+        rendered = processor.apply_chat_template(chat, tokenize=False, use_cot_system_prompt=True)
+        return QueryResult(inputs={"prompt": rendered}, limit_mm_per_prompt={})
+
+    if not os.path.exists(image_path):
+        raise FileNotFoundError(f"Image file not found: {image_path}")
+    rgb_image = convert_image_mode(Image.open(image_path), "RGB")
+    image_part = {"type": "image", "image": rgb_image}
+    text_part = {"type": "text", "text": question}
+    chat = [{"role": "HUMAN", "content": [image_part, text_part]}]
+    rendered = processor.apply_chat_template(chat, tokenize=False, use_cot_system_prompt=True)
+    return QueryResult(
+        inputs={
+            "prompt": rendered,
+            "multi_modal_data": {"image": rgb_image},
+        },
+        limit_mm_per_prompt={"image": 1},
+    )
+
+
+query_map = {  # --query-type value -> query builder; the keys are also used as the argparse choices
+    "text": get_text_query,
+    "use_audio": get_audio_query,
+    "use_image": get_image_query,
+    "use_video": get_video_query,
+    "use_mixed_modalities": get_mixed_modalities_query,
+    "reasoning": get_reasoning_query,
+}
+
+
+def main(args):
+    """Run the example end to end: build the query, generate with Omni, print and save text outputs."""
+    print(
+        "=" * 20,
+        "\n",
+        f"vllm version: {vllm.__version__}\n",
+        f"vllm-omni version: {vllm_omni.__version__}\n",
+        "=" * 20,
+        sep="",
+    )
+
+    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
+    assert isinstance(processor, MingFlashOmniProcessor), f"Wrong processor type being used: {type(processor)}"
+
+    query_func = query_map[args.query_type]
+    if args.query_type == "use_image":
+        query_result = query_func(processor, image_path=args.image_path)
+    elif args.query_type == "use_audio":
+        query_result = query_func(processor, audio_path=args.audio_path, sampling_rate=args.sampling_rate)
+    elif args.query_type == "use_video":
+        query_result = query_func(processor, video_path=args.video_path, num_frames=args.num_frames)
+    elif args.query_type == "use_mixed_modalities":
+        query_result = query_func(
+            processor,
+            image_path=args.image_path,
+            audio_path=args.audio_path,
+            sampling_rate=args.sampling_rate,
+        )
+    elif args.query_type == "reasoning":
+        query_result = query_func(processor, image_path=args.image_path)
+    else:
+        query_result = query_func(processor)
+
+    # Initialize Omni (with thinker-only stage config)
+    omni = Omni(
+        model=MODEL_NAME,
+        stage_configs_path=args.stage_configs_path,
+        log_stats=args.log_stats,
+        init_timeout=args.init_timeout,
+        stage_init_timeout=args.stage_init_timeout,
+    )
+
+    # Run everything else under try/finally so the engine is closed even if generation fails.
+    try:
+        # Thinker sampling params
+        thinker_sampling_params = SamplingParams(
+            temperature=0.4,
+            top_p=0.9,
+            max_tokens=args.max_tokens,
+            repetition_penalty=1.05,
+            seed=SEED,
+            detokenize=True,
+        )
+        sampling_params_list = [thinker_sampling_params]
+
+        # Give each prompt its own (shallow) dict so per-prompt keys such as "modalities" are not aliased.
+        prompts = [dict(query_result.inputs) for _ in range(args.num_prompts)]
+
+        if args.modalities is not None:
+            output_modalities = args.modalities.split(",")
+            for prompt in prompts:
+                prompt["modalities"] = output_modalities
+
+        total_requests = len(prompts)
+        processed_count = 0
+        print(f"Query type: {args.query_type}")
+        print(f"Number of prompts: {total_requests}")
+
+        output_dir = args.output_dir
+        os.makedirs(output_dir, exist_ok=True)
+
+        profiler_enabled = args.enable_profiler
+        if profiler_enabled:
+            omni.start_profile(stages=args.profiler_stages)
+
+        for stage_outputs in omni.generate(prompts, sampling_params_list):
+            output = stage_outputs.request_output
+            if stage_outputs.final_output_type == "text":
+                request_id = output.request_id
+                text_output = output.outputs[0].text
+                lines = ["Prompt:\n", str(output.prompt) + "\n", "Text Output:\n", str(text_output).strip() + "\n"]
+                print(*lines, sep="")
+
+                # Save to file (best effort: a failed write is reported but does not abort the run)
+                out_txt = os.path.join(output_dir, f"{request_id}.txt")
+                try:
+                    with open(out_txt, "w", encoding="utf-8") as f:
+                        f.writelines(lines)
+                    print(f"Request ID: {request_id}, text saved to {out_txt}")
+                except Exception as e:
+                    print(f"Failed to write output file {out_txt}: {e}")
+
+            elif stage_outputs.final_output_type == "audio":
+                raise NotImplementedError("Add audio example after talker supported.")
+
+            processed_count += 1
+            if profiler_enabled and processed_count >= total_requests:
+                print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...")
+                # Stop the profiler while workers are still alive
+                omni.stop_profile(stages=args.profiler_stages)
+
+                print("[Info] Waiting 30s for workers to write trace files to disk...")
+                time.sleep(30)
+                print("[Info] Trace export wait time finished.")
+    finally:
+        omni.close()
+
+
+def parse_args():
+    """Parse the command-line flags of this example and return the resulting namespace."""
+    parser = FlexibleArgumentParser(description="Ming-flash-omni 2.0 offline inference example")
+    parser.add_argument(
+        "--query-type",
+        "-q",
+        type=str,
+        default="text",
+        choices=query_map.keys(),
+        help="Query type.",
+    )
+    parser.add_argument(
+        "--stage-configs-path",
+        type=str,
+        default=None,
+        help="Path to a stage configs YAML file.",
+    )
+    parser.add_argument(
+        "--log-stats",
+        action="store_true",
+        help="Enable detailed statistics logging.",
+    )
+    parser.add_argument("--init-timeout", type=int, default=2000, help="Timeout for initializing in seconds.")
+    parser.add_argument(
+        "--stage-init-timeout",
+        type=int,
+        default=2000,
+        help="Timeout for initializing a single stage in seconds.",
+    )
+    parser.add_argument(
+        "--enable-profiler",
+        action="store_true",
+        help="Enables profiling when set.",
+    )
+    # NOTE: omitting the flag profiles stage 0 only (the default below), so the help text says so.
+    parser.add_argument(
+        "--profiler-stages",
+        type=int,
+        nargs="*",
+        default=[0],
+        help="List of stage IDs to profile. Defaults to stage 0 when the flag is omitted.",
+    )
+    parser.add_argument(
+        "--image-path",
+        "-i",
+        type=str,
+        default=None,
+        help="Path to local image file. Uses default asset if not provided.",
+    )
+    parser.add_argument(
+        "--audio-path",
+        "-a",
+        type=str,
+        default=None,
+        help="Path to local audio file. Uses default asset if not provided.",
+    )
+    parser.add_argument(
+        "--video-path",
+        "-v",
+        type=str,
+        default=None,
+        help="Path to local video file. Uses default asset if not provided.",
+    )
+    parser.add_argument(
+        "--num-frames",
+        type=int,
+        default=16,
+        help="Number of frames to extract from video.",
+    )
+    parser.add_argument(
+        "--sampling-rate",
+        type=int,
+        default=16000,
+        help="Sampling rate for audio loading.",
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=16384,
+        help="Maximum tokens to generate.",
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1,
+        help="Number of prompts to generate.",
+    )
+    parser.add_argument(
+        "--modalities",
+        type=str,
+        default=None,
+        help="Output modalities (comma-separated).",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="output_ming",
+        help="Output directory for results.",
+    )
+
+    return parser.parse_args()
+
+
+if __name__ == "__main__":  # script entry point: parse the CLI flags, then run the example
+    args = parse_args()
+    main(args)
diff --git a/examples/online_serving/ming_flash_omni/README.md b/examples/online_serving/ming_flash_omni/README.md
new file mode 100644
index 00000000000..502232725c2
--- /dev/null
+++ b/examples/online_serving/ming_flash_omni/README.md
@@ -0,0 +1,204 @@
+# Ming-flash-omni 2.0
+
+## Installation
+
+Please refer to [README.md](../../../README.md)
+
+## Run examples (Ming-flash-omni 2.0)
+
+### Launch the Server
+
+```bash
+vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091
+```
+
+If you have a custom stage configs file, launch the server with the command below:
+```bash
+vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
+```
+
+### Send Multi-modal Request
+
+#### Send request via python
+
+```bash
+python examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py --model Jonathan1909/Ming-flash-omni-2.0 --query-type use_mixed_modalities --port 8091 --host "localhost" --modalities text
+```
+
+The Python client supports the following command-line arguments:
+
+- `--query-type` (or `-q`): Query type. Options: `text`, `use_audio`, `use_image`, `use_video`, `use_mixed_modalities`
+- `--video-path` (or `-v`): Path to local video file or URL. If not provided and query-type uses video, uses default video URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs. Example: `--video-path /path/to/video.mp4` or `--video-path https://example.com/video.mp4`
+- `--image-path` (or `-i`): Path to local image file or URL. If not provided and query-type uses image, uses default image URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs and common image formats: JPEG, PNG, GIF, WebP. Example: `--image-path /path/to/image.jpg` or `--image-path https://example.com/image.png`
+- `--audio-path` (or `-a`): Path to local audio file or URL. If not provided and query-type uses audio, uses default audio URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs and common audio formats: MP3, WAV, OGG, FLAC, M4A. Example: `--audio-path /path/to/audio.wav` or `--audio-path https://example.com/audio.mp3`
+- `--prompt` (or `-p`): Custom text prompt/question. If not provided, uses default prompt for the selected query type. Example: `--prompt "What are the main activities shown in this video?"`
+- `--modalities`: Output modalities. For now, only `text` is supported. Example: `--modalities text`
+
+
+#### Send request via curl
+
+```bash
+bash run_curl_multimodal_generation.sh text
+bash run_curl_multimodal_generation.sh use_image
+bash run_curl_multimodal_generation.sh use_audio
+bash run_curl_multimodal_generation.sh use_video
+bash run_curl_multimodal_generation.sh use_mixed_modalities
+```
+
+## Modality control
+
+Ming-flash-omni 2.0 currently supports text output only (thinker stage).
+
+| Modalities | Output |
+|------------|--------|
+| `["text"]` | Text only |
+| Not specified | Text only (default) |
+
+### Using curl
+
+```bash
+curl http://localhost:8091/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Jonathan1909/Ming-flash-omni-2.0",
+    "messages": [
+      {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]},
+      {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"}
+    ],
+    "modalities": ["text"]
+  }'
+```
+
+### Using OpenAI Python SDK
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY")
+
+response = client.chat.completions.create(
+    model="Jonathan1909/Ming-flash-omni-2.0",
+    messages=[
+        {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]},
+        {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"},
+    ],
+    modalities=["text"],
+)
+print(response.choices[0].message.content)
+```
+
+### Multi-modal input with OpenAI Python SDK
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY")
+
+response = client.chat.completions.create(
+    model="Jonathan1909/Ming-flash-omni-2.0",
+    messages=[
+        {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]},
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg"}},
+                {"type": "text", "text": "Describe this image in detail."},
+            ],
+        },
+    ],
+    modalities=["text"],
+)
+print(response.choices[0].message.content)
+```
+
+## Streaming Output
+
+To enable streaming output:
+
+```bash
+python examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py \
+    --query-type use_image \
+    --model Jonathan1909/Ming-flash-omni-2.0 \
+    --modalities text \
+    --stream
+```
+
+Or with the OpenAI Python SDK:
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY")
+
+response = client.chat.completions.create(
+    model="Jonathan1909/Ming-flash-omni-2.0",
+    messages=[
+        {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]},
+        {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"},
+    ],
+    modalities=["text"],
+    stream=True,
+)
+for chunk in response:
+    for choice in chunk.choices:
+        if hasattr(choice, "delta") and choice.delta.content:
+            print(choice.delta.content, end="", flush=True)
+print()
+```
+
+Or using curl:
+
+```bash
+curl http://localhost:8091/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Jonathan1909/Ming-flash-omni-2.0",
+    "messages": [
+      {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]},
+      {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"}
+    ],
+    "modalities": ["text"],
+    "stream": true
+  }'
+```
+
+
+## Reasoning (Thinking Mode)
+
+To enable reasoning/thinking mode, change `detailed thinking off` to `detailed thinking on` in the system prompt:
+
+### Using curl
+
+```bash
+curl http://localhost:8091/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Jonathan1909/Ming-flash-omni-2.0",
+    "messages": [
+      {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking on"}]},
+      {"role": "user", "content": [
+        {"type": "image_url", "image_url": {"url": "https://example.com/math_problem.png"}},
+        {"type": "text", "text": "Solve this math problem step by step."}
+      ]}
+    ],
+    "modalities": ["text"]
+  }'
+```
+
+### Using OpenAI Python SDK
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY")
+
+response = client.chat.completions.create(
+    model="Jonathan1909/Ming-flash-omni-2.0",
+    messages=[
+        {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking on"}]},
+        {"role": "user", "content": "If a train travels 120 km in 2 hours, what is its average speed?"},
+    ],
+    modalities=["text"],
+)
+print(response.choices[0].message.content)
+```
diff --git a/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh b/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh
new file mode 100755
index 00000000000..768a424e451
--- /dev/null
+++ b/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh
@@ -0,0 +1,145 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Server port
+PORT="${PORT:-8091}"
+# Default query type
+QUERY_TYPE="${1:-text}"
+
+# Validate query type against the supported case branches; otherwise print usage and exit 1
+if [[ ! "$QUERY_TYPE" =~ ^(text|use_audio|use_image|use_video|use_mixed_modalities)$ ]]; then
+    echo "Error: Invalid query type '$QUERY_TYPE'"
+    echo "Usage: $0 [text|use_audio|use_image|use_video|use_mixed_modalities]"
+    echo "  text: Text-only query"
+    echo "  use_audio: Audio + Text query"
+    echo "  use_image: Image + Text query"
+    echo "  use_video: Video + Text query"
+    echo "  use_mixed_modalities: Image + Audio + Text query"
+    exit 1
+fi
+
+thinker_sampling_params='{
+  "temperature": 0.4,
+  "top_p": 0.9,
+  "top_k": -1,
+  "max_tokens": 16384,
+  "seed": 42,
+  "detokenize": true,
+  "repetition_penalty": 1.05
+}'
+# The sampling params above are optional; stage_configs of the corresponding model provide defaults.
+
+# Define URLs for assets
+MARY_HAD_LAMB_AUDIO_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/mary_had_lamb.ogg"
+CHERRY_BLOSSOM_IMAGE_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg"
+SAMPLE_VIDEO_URL="https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4"
+
+# Build user content based on query type
+case "$QUERY_TYPE" in
+  text)
+    user_content='[
+      {
+        "type": "text",
+        "text": "请详细介绍鹦鹉的生活习性。"
+      }
+    ]'
+    ;;
+  use_image)
+    user_content='[
+        {
+          "type": "image_url",
+          "image_url": {
+            "url": "'"$CHERRY_BLOSSOM_IMAGE_URL"'"
+          }
+        },
+        {
+          "type": "text",
+          "text": "Describe this image in detail."
+        }
+      ]'
+    ;;
+  use_audio)
+    user_content='[
+        {
+          "type": "audio_url",
+          "audio_url": {
+            "url": "'"$MARY_HAD_LAMB_AUDIO_URL"'"
+          }
+        },
+        {
+          "type": "text",
+          "text": "Please recognize the language of this speech and transcribe it. Format: oral."
+        }
+      ]'
+    ;;
+  use_video)
+    user_content='[
+        {
+          "type": "video_url",
+          "video_url": {
+            "url": "'"$SAMPLE_VIDEO_URL"'"
+          }
+        },
+        {
+          "type": "text",
+          "text": "Describe what is happening in this video."
+        }
+      ]'
+    ;;
+  use_mixed_modalities)
+    user_content='[
+        {
+          "type": "image_url",
+          "image_url": {
+            "url": "'"$CHERRY_BLOSSOM_IMAGE_URL"'"
+          }
+        },
+        {
+          "type": "audio_url",
+          "audio_url": {
+            "url": "'"$MARY_HAD_LAMB_AUDIO_URL"'"
+          }
+        },
+        {
+          "type": "text",
+          "text": "Describe the image, and recognize the language of this speech and transcribe it. Format: oral"
+        }
+      ]'
+    ;;
+esac
+
+echo "Running query type: $QUERY_TYPE"
+echo ""
+
+request_body=$(cat <<EOF
+{
+  "model": "Jonathan1909/Ming-flash-omni-2.0",
+  "sampling_params_list": [
+    $thinker_sampling_params
+  ],
+  "modalities": ["text"],
+  "messages": [
+    {
+      "role": "system",
+      "content": [
+        {
+          "type": "text",
+          "text": "你是一个友好的AI助手。\n\ndetailed thinking off"
+        }
+      ]
+    },
+    {
+      "role": "user",
+      "content": $user_content
+    }
+  ]
+}
+EOF
+)
+
+output=$(curl -sS --retry 3 --retry-delay 3 --retry-connrefused \
+    -X POST http://localhost:${PORT}/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d "$request_body")
+
+echo "Output of request: $(echo "$output" | jq '.choices[0].message.content')"
diff --git a/tests/conftest.py b/tests/conftest.py
index ad1008b7263..3434eb0aed3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3026,6 +3026,10 @@ def get_omni_inputs(
             video_padding_token = "<|video_pad|>"
             image_padding_token = "<|image_pad|>"
             audio_padding_token = "<|audio_pad|>"
+        elif "Ming-flash-omni" in self.model_name:
+            video_padding_token = "<VIDEO>"
+            image_padding_token = "<IMAGE>"
+            audio_padding_token = "<AUDIO>"
 
         if isinstance(prompts, str):
             prompts = [prompts]
diff --git a/tests/e2e/offline_inference/test_ming_flash_omni.py b/tests/e2e/offline_inference/test_ming_flash_omni.py
new file mode 100644
index 00000000000..be0ed3b056f
--- /dev/null
+++ b/tests/e2e/offline_inference/test_ming_flash_omni.py
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
+
+from pathlib import Path
+
+import pytest
+
+from tests.conftest import (
+    generate_synthetic_audio,
+    generate_synthetic_image,
+    generate_synthetic_video,
+    modify_stage_config,
+)
+from tests.utils import hardware_test
+
+models = ["Jonathan1909/Ming-flash-omni-2.0"]
+
+# Ming-specific prompt pieces. SYSTEM_PROMPT's Chinese sentence means "You are a friendly AI assistant."
+SYSTEM_PROMPT = "你是一个友好的AI助手。\n\ndetailed thinking off"
+EOS_TOKEN = "<|role_end|>"  # appended after the SYSTEM and HUMAN turns in build_prompt()
+IMAGE_TOKEN = "<IMAGE>"  # modality placeholders that the tests embed in the user text
+VIDEO_TOKEN = "<VIDEO>"
+AUDIO_TOKEN = "<AUDIO>"
+
+
+def build_prompt(user_text: str) -> str:
+    """Wrap ``user_text`` in a SYSTEM turn, a HUMAN turn, and an open ASSISTANT turn."""
+    system_turn = f"<role>SYSTEM</role>{SYSTEM_PROMPT}{EOS_TOKEN}"
+    human_turn = f"<role>HUMAN</role>{user_text}{EOS_TOKEN}"
+    return f"{system_turn}{human_turn}<role>ASSISTANT</role>"
+
+
+def get_eager_config():
+    """Return modify_stage_config()'s result for the CI stage config with stage 0 set to eager."""
+    base_config = Path(__file__).parent.parent / "stage_configs" / "bailingmm_moe_v2_lite_ci.yaml"
+    eager_override = {"engine_args.enforce_eager": "true"}
+    return modify_stage_config(
+        str(base_config),
+        updates={
+            "stage_args": {
+                0: eager_override,
+            },
+        },
+    )
+
+
+stage_configs = [get_eager_config()]  # evaluated once at import time, i.e. during test collection
+test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100"}, num_cards=4)
+@pytest.mark.parametrize("omni_runner", test_params, indirect=True)
+def test_text_to_text(omni_runner, omni_runner_handler) -> None:
+    """Send one text-only prompt and request a text response.
+
+    Input Modal: text
+    Output Modal: text
+    """
+    # The Chinese question reads "Please describe the living habits of parrots in detail."
+    text_prompt = build_prompt("请详细介绍鹦鹉的生活习性。")
+    payload = {"prompts": text_prompt, "modalities": ["text"]}
+    omni_runner_handler.send_request(payload)
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100"}, num_cards=4)
+@pytest.mark.parametrize("omni_runner", test_params, indirect=True)
+def test_image_to_text(omni_runner, omni_runner_handler) -> None:
+    """Send a synthetic 224x224 image with a text instruction and request text.
+
+    Input Modal: image + text
+    Output Modal: text
+    """
+    pixels = generate_synthetic_image(224, 224)["np_array"]
+    image_prompt = build_prompt(f"{IMAGE_TOKEN}Describe this image briefly.")
+    payload = {"prompts": image_prompt, "images": pixels, "modalities": ["text"]}
+
+    omni_runner_handler.send_request(payload)
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100"}, num_cards=4)
+@pytest.mark.parametrize("omni_runner", test_params, indirect=True)
+def test_audio_to_text(omni_runner, omni_runner_handler) -> None:
+    """Send a synthetic audio clip with a transcription instruction and request text.
+
+    Input Modal: audio + text
+    Output Modal: text
+    """
+    waveform = generate_synthetic_audio(2, 1, 16000)["np_array"]
+    if waveform.ndim == 2:
+        waveform = waveform.squeeze()  # remove size-1 axes from the 2-D array
+    text = build_prompt(f"{AUDIO_TOKEN}Please recognize the language of this speech and transcribe it. Format: oral.")
+    payload = {"prompts": text, "audios": waveform, "modalities": ["text"]}
+
+    omni_runner_handler.send_request(payload)
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100"}, num_cards=4)
+@pytest.mark.parametrize("omni_runner", test_params, indirect=True)
+def test_video_to_text(omni_runner, omni_runner_handler) -> None:
+    """Send a synthetic video with a text instruction and request text.
+
+    Input Modal: video + text
+    Output Modal: text
+    """
+    frames = generate_synthetic_video(224, 224, 30)["np_array"]
+    video_prompt = build_prompt(f"{VIDEO_TOKEN}Describe what is happening in this video.")
+    payload = {"prompts": video_prompt, "videos": frames, "modalities": ["text"]}
+
+    omni_runner_handler.send_request(payload)
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100"}, num_cards=4)
+@pytest.mark.parametrize("omni_runner", test_params, indirect=True)
+def test_mixed_to_text(omni_runner, omni_runner_handler) -> None:
+    """Send one image and one audio clip together with a combined instruction.
+
+    Input Modal: image + audio + text
+    Output Modal: text
+    """
+    pixels = generate_synthetic_image(224, 224)["np_array"]
+    waveform = generate_synthetic_audio(2, 1, 16000)["np_array"]
+    if waveform.ndim == 2:
+        waveform = waveform.squeeze()
+    mixed_prompt = build_prompt(f"{IMAGE_TOKEN}{AUDIO_TOKEN}Describe the image and transcribe the audio.")
+    payload = {"prompts": mixed_prompt, "images": pixels, "audios": waveform, "modalities": ["text"]}
+
+    omni_runner_handler.send_request(payload)
diff --git a/tests/e2e/online_serving/test_ming_flash_omni.py b/tests/e2e/online_serving/test_ming_flash_omni.py
new file mode 100644
index 00000000000..35b7b64c061
--- /dev/null
+++ b/tests/e2e/online_serving/test_ming_flash_omni.py
@@ -0,0 +1,247 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+E2E online serving tests for Ming-flash-omni-2.0 model (Thinker stage).
+Tests multimodal understanding via OpenAI-compatible API.
+"""
+
+import os
+from pathlib import Path
+
+import pytest
+
+from tests.conftest import (
+    OmniServerParams,
+    dummy_messages_from_mix_data,
+    generate_synthetic_audio,
+    generate_synthetic_image,
+    generate_synthetic_video,
+    modify_stage_config,
+)
+from tests.utils import hardware_test
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  # NOTE(review): runs after the tests.conftest import; confirm
+os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
+
+models = ["Jonathan1909/Ming-flash-omni-2.0"]  # model IDs combined with stage_configs in test_params
+
+
+def get_eager_config():
+    """Return modify_stage_config()'s result for the CI stage config with stage 0 set to eager."""
+    base_config = Path(__file__).parent.parent / "stage_configs" / "bailingmm_moe_v2_lite_ci.yaml"
+    eager_override = {"engine_args.enforce_eager": "true"}
+    return modify_stage_config(
+        str(base_config),
+        updates={
+            "stage_args": {
+                0: eager_override,
+            },
+        },
+    )
+
+
+stage_configs = [get_eager_config()]
+
+# One OmniServerParams per (model, stage config) pair; built at import time for parametrize()
+test_params = [
+    OmniServerParams(model=model, stage_config_path=stage_config) for model in models for stage_config in stage_configs
+]
+
+
+def get_system_prompt():
+    """Build the system message used by every request in this module.
+
+    The Chinese sentence reads "You are a friendly AI assistant."; the trailing
+    "detailed thinking off" line is part of the literal prompt text.
+    """
+    persona_text = "你是一个友好的AI助手。\n\ndetailed thinking off"
+    text_part = {"type": "text", "text": persona_text}
+
+    return {"role": "system", "content": [text_part]}
+
+
+def get_prompt(prompt_type="text_only"):
+    """Return the user question for ``prompt_type``; unknown types get the text-only question."""
+    by_modality = {
+        "text_image": "What is in this image?",
+        "text_audio": "What is in this audio?",
+        "text_video": "What is in this video?",
+        "mix": "What is recited in the audio? What is in this image? What is in this video?",
+    }
+    return by_modality.get(prompt_type, "What is the capital of China? Answer in 20 words.")
+
+
+def get_max_batch_size(size_type="few"):
+    """Map a size tier ("few"/"medium"/"large") to a request count; unknown tiers give 5."""
+    return {"few": 5, "medium": 100, "large": 256}.get(size_type, 5)
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100"}, num_cards=4)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_text_to_text_001(omni_server, openai_client) -> None:
+    """Ask the text-only question in one non-streaming request, passing "beijing" as a key word.
+
+    Input Modal: text
+    Output Modal: text
+    Input Setting: stream=False
+    Datasets: single request
+    """
+    chat_messages = dummy_messages_from_mix_data(
+        system_prompt=get_system_prompt(),
+        content_text=get_prompt("text_only"),
+    )
+    openai_client.send_omni_request(
+        {
+            "model": omni_server.model,
+            "messages": chat_messages,
+            "stream": False,
+            "modalities": ["text"],
+            "key_words": {"text": ["beijing"]},
+        }
+    )
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100"}, num_cards=4)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_text_to_text_stream_001(omni_server, openai_client) -> None:
+    """Ask the text-only question with streaming, passing request_num=get_max_batch_size().
+
+    Input Modal: text
+    Output Modal: text
+    Input Setting: stream=True
+    Datasets: few requests
+    """
+    chat_messages = dummy_messages_from_mix_data(
+        system_prompt=get_system_prompt(),
+        content_text=get_prompt("text_only"),
+    )
+    payload = {
+        "model": omni_server.model,
+        "messages": chat_messages,
+        "stream": True,
+        "modalities": ["text"],
+        "key_words": {"text": ["beijing"]},
+    }
+
+    openai_client.send_omni_request(payload, request_num=get_max_batch_size())
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100"}, num_cards=4)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_image_to_text_001(omni_server, openai_client) -> None:
+    """Ask about a synthetic 224x224 image sent as a base64 data URL, with streaming.
+
+    Input Modal: image + text
+    Output Modal: text
+    Input Setting: stream=True
+    Datasets: single request
+    """
+    image_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}"
+    chat_messages = dummy_messages_from_mix_data(
+        system_prompt=get_system_prompt(),
+        image_data_url=image_url,
+        content_text=get_prompt("text_image"),
+    )
+    payload = {
+        "model": omni_server.model,
+        "messages": chat_messages,
+        "stream": True,
+        "modalities": ["text"],
+    }
+
+    openai_client.send_omni_request(payload)
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100"}, num_cards=4)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_audio_to_text_001(omni_server, openai_client) -> None:
+    """Ask about a synthetic audio clip sent as a base64 data URL, with streaming.
+
+    Input Modal: audio + text
+    Output Modal: text
+    Input Setting: stream=True
+    Datasets: single request
+    """
+    audio_url = f"data:audio/wav;base64,{generate_synthetic_audio(2, 1)['base64']}"
+    chat_messages = dummy_messages_from_mix_data(
+        system_prompt=get_system_prompt(),
+        audio_data_url=audio_url,
+        content_text=get_prompt("text_audio"),
+    )
+    payload = {
+        "model": omni_server.model,
+        "messages": chat_messages,
+        "stream": True,
+        "modalities": ["text"],
+    }
+
+    openai_client.send_omni_request(payload)
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100"}, num_cards=4)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_video_to_text_001(omni_server, openai_client) -> None:
+    """Ask about a synthetic video sent as a base64 data URL, without streaming.
+
+    Input Modal: video + text
+    Output Modal: text
+    Input Setting: stream=False
+    Datasets: single request
+    """
+    video_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}"
+    chat_messages = dummy_messages_from_mix_data(
+        system_prompt=get_system_prompt(),
+        video_data_url=video_url,
+        content_text=get_prompt("text_video"),
+    )
+    payload = {
+        "model": omni_server.model,
+        "messages": chat_messages,
+        "stream": False,
+        "modalities": ["text"],
+    }
+
+    openai_client.send_omni_request(payload)
+
+
+@pytest.mark.advanced_model
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100"}, num_cards=4)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_mix_to_text_001(omni_server, openai_client) -> None:
+    """Ask one question about a synthetic video, image and audio clip sent together, with streaming.
+
+    Input Modal: text + audio + image + video
+    Output Modal: text
+    Input Setting: stream=True
+    Datasets: single request
+    """
+    video_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}"
+    image_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}"
+    audio_url = f"data:audio/wav;base64,{generate_synthetic_audio(2, 1)['base64']}"
+    chat_messages = dummy_messages_from_mix_data(
+        system_prompt=get_system_prompt(),
+        video_data_url=video_url,
+        image_data_url=image_url,
+        audio_data_url=audio_url,
+        content_text=get_prompt("mix"),
+    )
+    payload = {
+        "model": omni_server.model,
+        "messages": chat_messages,
+        "stream": True,
+        "modalities": ["text"],
+    }
+
+    openai_client.send_omni_request(payload)
diff --git a/tests/e2e/stage_configs/bailingmm_moe_v2_lite_ci.yaml b/tests/e2e/stage_configs/bailingmm_moe_v2_lite_ci.yaml
new file mode 100644
index 00000000000..fb0c72cc513
--- /dev/null
+++ b/tests/e2e/stage_configs/bailingmm_moe_v2_lite_ci.yaml
@@ -0,0 +1,35 @@
+# CI stage config used by the Ming-flash-omni e2e tests: thinker stage only
+stage_args:
+  - stage_id: 0
+    stage_type: llm
+    runtime:
+      devices: "0,1,2,3"  # four devices, matching tensor_parallel_size below
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker  # the only stage MingFlashOmniForConditionalGeneration implements so far
+      model_arch: MingFlashOmniForConditionalGeneration
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.9
+      enforce_eager: false  # the e2e tests override this to "true" through modify_stage_config
+      trust_remote_code: true
+      engine_output_type: latent
+      distributed_executor_backend: "mp"
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      max_model_len: 32768
+      tensor_parallel_size: 4
+      hf_config_name: llm_config
+      load_format: dummy  # NOTE(review): online tests pass key_words (e.g. "beijing"); confirm this holds with dummy
+      mm_processor_cache_gb: 0
+    final_output: true
+    final_output_type: text
+    is_comprehension: true
+    default_sampling_params:
+      temperature: 0.4
+      top_p: 0.9
+      max_tokens: 100
+      repetition_penalty: 1.05
+      seed: 42
+      detokenize: true
+      ignore_eos: false
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/__init__.py b/vllm_omni/model_executor/models/ming_flash_omni/__init__.py
new file mode 100644
index 00000000000..d7fa44fd7e4
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_flash_omni/__init__.py
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 The vLLM-Omni team.
+
+from .ming_flash_omni import MingFlashOmniForConditionalGeneration
+from .ming_flash_omni_thinker import (
+    MingFlashOmniThinkerDummyInputsBuilder,
+    MingFlashOmniThinkerForConditionalGeneration,
+    MingFlashOmniThinkerMultiModalProcessor,
+    MingFlashOmniThinkerProcessingInfo,
+)
+
+__all__ = [  # public names re-exported from the two submodules imported above
+    "MingFlashOmniForConditionalGeneration",
+    "MingFlashOmniThinkerForConditionalGeneration",
+    "MingFlashOmniThinkerProcessingInfo",
+    "MingFlashOmniThinkerMultiModalProcessor",
+    "MingFlashOmniThinkerDummyInputsBuilder",
+]
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/audio_encoder.py b/vllm_omni/model_executor/models/ming_flash_omni/audio_encoder.py
new file mode 100644
index 00000000000..6ca19901141
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_flash_omni/audio_encoder.py
@@ -0,0 +1,246 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 The vLLM-Omni team.
+# Copyright 2024 ANT Group and the HuggingFace Inc. team.
+# Copyright (c) 2022 OpenAI
+# Adapted from Ming repository modeling_whisper_encoder.py
+# https://github.com/inclusionAI/Ming
+
+import operator
+from collections.abc import Iterable
+from itertools import accumulate
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+from vllm_omni.diffusion.attention.backends.utils.fa import HAS_FLASH_ATTN, flash_attn_varlen_func
+from vllm_omni.model_executor.models.whisper_utils import Conv1d, Linear, sinusoids
+
+logger = init_logger(__name__)
+
+
+class MultiHeadAttention(nn.Module):
+    """Multi-head self-attention over packed variable-length sequences.
+
+    Adapted from Qwen3-TTS WhisperEncoder.
+    """
+
+    def __init__(self, n_state: int, n_head: int, use_flash_attn: bool = True):
+        super().__init__()
+        self.n_head = n_head
+        self.query = Linear(n_state, n_state)
+        self.key = Linear(n_state, n_state, bias=False)
+        self.value = Linear(n_state, n_state)
+        self.out = Linear(n_state, n_state)
+
+        if use_flash_attn and not HAS_FLASH_ATTN:
+            logger.warning("flash-attn is not available. Fallback to manual PyTorch version")
+        self.use_flash_attn = use_flash_attn and HAS_FLASH_ATTN
+
+    def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor | None) -> torch.Tensor:
+        """Apply self-attention independently within each packed sequence.
+
+        Args:
+            x: [total_tokens, n_state] packed sequence
+            cu_seqlens: [num_seqs + 1] cumulative sequence lengths, e.g. [0, len1, len1+len2, ...];
+                ``None`` treats ``x`` as one single sequence.
+
+        Returns:
+            [total_tokens, n_state] attention output
+        """
+        n_ctx, n_state = x.shape
+        head_dim = n_state // self.n_head
+        if cu_seqlens is None:
+            # Without this, None would reach the fallback path and fail when it is indexed.
+            cu_seqlens = torch.tensor([0, n_ctx], dtype=torch.int32, device=x.device)
+
+        q = self.query(x).view(n_ctx, self.n_head, head_dim)
+        k = self.key(x).view(n_ctx, self.n_head, head_dim)
+        v = self.value(x).view(n_ctx, self.n_head, head_dim)
+
+        # The varlen flash-attn kernel is only used for fp16/bf16 inputs; otherwise use the padded fallback.
+        if self.use_flash_attn and q.dtype in (torch.float16, torch.bfloat16):
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+            attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen)
+        else:
+            attn_output = self._manual_attention(q, k, v, cu_seqlens)
+
+        # Reshape back: [T, H, D] -> [T, H*D]
+        attn_output = attn_output.contiguous().view(n_ctx, n_state)
+        return self.out(attn_output)
+
+    def _manual_attention(
+        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, cu_seqlens: torch.Tensor
+    ) -> torch.Tensor:
+        """Padded-batch attention fallback: packed [T, H, D] inputs -> packed [T, H, D] output."""
+        _, n_head, head_dim = q.shape
+        scale = head_dim**-0.5
+
+        # One host transfer up front; Python ints avoid a device sync on every slice below.
+        bounds = cu_seqlens.tolist()
+        spans = list(zip(bounds[:-1], bounds[1:], strict=True))
+        seqlens = [end - start for start, end in spans]
+        if not seqlens:
+            # No sequences at all: max() below would raise on the empty list.
+            return q.new_empty((0, n_head, head_dim))
+        batch_size = len(seqlens)
+        max_seqlen = max(seqlens)
+
+        # Scatter every sequence into a zero-padded [B, T_max, H, D] batch
+        q_padded = torch.zeros(batch_size, max_seqlen, n_head, head_dim, dtype=q.dtype, device=q.device)
+        k_padded = torch.zeros_like(q_padded)
+        v_padded = torch.zeros_like(q_padded)
+        for i, (start, end) in enumerate(spans):
+            q_padded[i, : end - start] = q[start:end]
+            k_padded[i, : end - start] = k[start:end]
+            v_padded[i, : end - start] = v[start:end]
+
+        # Transpose for attention: [B, H, T, D]
+        q_padded = q_padded.transpose(1, 2)
+        k_padded = k_padded.transpose(1, 2)
+        v_padded = v_padded.transpose(1, 2)
+
+        # Additive key mask: 0 for valid positions, the most negative finite value for padding
+        padding_mask = (
+            torch.arange(max_seqlen, device=q.device)[None, :] >= torch.tensor(seqlens, device=q.device)[:, None]
+        )
+        attn_mask = torch.zeros(batch_size, 1, 1, max_seqlen, dtype=q.dtype, device=q.device)
+        attn_mask = attn_mask.masked_fill(padding_mask.unsqueeze(1).unsqueeze(2), -torch.finfo(q.dtype).max)
+
+        # Compute attention
+        attn_scores = torch.matmul(q_padded, k_padded.transpose(-2, -1)) * scale
+        attn_scores = attn_scores + attn_mask
+        attn_weights = F.softmax(attn_scores, dim=-1)
+        context = torch.matmul(attn_weights, v_padded)
+
+        # Transpose back: [B, H, T, D] -> [B, T, H, D], then drop padding and re-pack
+        context = context.transpose(1, 2).contiguous()
+
+        return torch.cat([context[i, :n] for i, n in enumerate(seqlens)], dim=0)
+
+
+class ResidualAttentionBlock(nn.Module):
+    """Residual block for packed sequences: LayerNorm -> attention, then LayerNorm -> 4x GELU MLP.
+
+    Adapted from
+    https://github.com/openai/whisper/blob/v20250625/whisper/model.py
+    vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py
+    """
+
+    def __init__(self, n_state: int, n_head: int, use_flash_attn: bool = True):
+        super().__init__()
+        self.attn = MultiHeadAttention(n_state, n_head, use_flash_attn=use_flash_attn)
+        self.attn_ln = nn.LayerNorm(n_state)
+
+        hidden_width = n_state * 4
+        self.mlp = nn.Sequential(
+            Linear(n_state, hidden_width),
+            nn.GELU(),
+            Linear(hidden_width, n_state),
+        )
+        self.mlp_ln = nn.LayerNorm(n_state)
+
+    def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor) -> torch.Tensor:
+        attn_out = self.attn(self.attn_ln(x), cu_seqlens=cu_seqlens)
+        x = x + attn_out
+        return x + self.mlp(self.mlp_ln(x))
+
+
+class WhisperAudioEncoder(nn.Module):
+    """Whisper audio encoder for Ming with packed sequence support.
+
+    Adapted from
+    https://github.com/openai/whisper/blob/v20250625/whisper/model.py
+    vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py
+    """
+
+    def __init__(
+        self,
+        n_mels: int = 128,
+        n_ctx: int = 15000,
+        n_state: int = 1280,
+        n_head: int = 20,
+        n_layer: int = 32,
+        use_flash_attn: bool = True,
+    ):
+        super().__init__()
+        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
+        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
+        # Sinusoidal table registered as a buffer; its n_ctx rows cap the encoded length per audio.
+        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))
+        self.blocks = nn.ModuleList(
+            [ResidualAttentionBlock(n_state, n_head, use_flash_attn=use_flash_attn) for _ in range(n_layer)]
+        )
+        self.ln_post = nn.LayerNorm(n_state)
+        self.audio_emb_dim = n_state
+
+        self.n_layer = n_layer
+        self.n_mels = n_mels
+        self.use_flash_attn = use_flash_attn
+
+    def forward(
+        self,
+        x_list: list[torch.Tensor],
+        audio_lens: list[int],
+    ) -> torch.Tensor:
+        """Forward pass with packed sequence format for variable-length inputs.
+
+        Args:
+            x_list: List of [n_mels, T_i] mel spectrogram features for each audio
+            audio_lens: List of original audio lengths in frames (not read here; kept for callers)
+
+        Returns:
+            [total_T', n_state] packed encoded audio features, where
+            total_T' is the sum of all encoded sequence lengths
+
+        Raises:
+            ValueError: If one encoded audio is longer than the positional-embedding table.
+        """
+        target_dtype = self.conv1.weight.dtype
+        max_positions = self.positional_embedding.shape[0]
+
+        encoded_list = []
+        encoded_lens = []
+        for mel_spec in x_list:
+            # [n_mels, T] -> [1, n_mels, T] in the model dtype, then the two conv layers
+            x = mel_spec.to(target_dtype).unsqueeze(0)
+            x = F.gelu(self.conv1(x))
+            x = F.gelu(self.conv2(x))
+            x = x.squeeze(0).transpose(0, 1)  # [T', n_state]
+
+            seq_len = x.shape[0]
+            if seq_len > max_positions:
+                # Otherwise the addition below fails with an opaque shape-mismatch error.
+                raise ValueError(f"Encoded audio length {seq_len} exceeds the {max_positions} positions available")
+            # Cast back because the sum may have been promoted to the buffer's dtype.
+            x = (x + self.positional_embedding[:seq_len, :]).to(x.dtype)
+
+            encoded_list.append(x)
+            encoded_lens.append(seq_len)
+
+        x_packed = torch.cat(encoded_list, dim=0)  # [total_T', n_state]
+        cu_seqlens = list(accumulate(encoded_lens, func=operator.add, initial=0))
+        cu_seqlens = torch.tensor(cu_seqlens, device=x_packed.device, dtype=torch.int32)
+        for block in self.blocks:
+            x_packed = block(x_packed, cu_seqlens=cu_seqlens)
+        return self.ln_post(x_packed)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load checkpoint tensors into same-named parameters/buffers; return the names loaded."""
+        params_dict: dict[str, torch.Tensor] = {
+            **dict(self.named_parameters(remove_duplicate=False)),
+            **dict(self.named_buffers()),
+        }
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            if name not in params_dict:
+                logger.warning("Skipping unknown audio encoder weight: %s", name)
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni.py b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni.py
new file mode 100644
index 00000000000..87728890b67
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni.py
@@ -0,0 +1,223 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 The vLLM-Omni team.
+# Copyright 2024 ANT Group and the HuggingFace Inc. team. All rights reserved.
+# Adapted from Ming repository modeling_bailingmm2.py
+# https://github.com/inclusionAI/Ming
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Ming-flash-omni-2.0 unified model (thinker + imagegen + talker)."""
+
+from collections.abc import Iterable
+
+import torch
+import torch.nn as nn
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.model_executor.models.interfaces import (
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.models.utils import (
+    init_vllm_registered_model,
+    maybe_prefix,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.sequence import IntermediateTensors
+
+from vllm_omni.model_executor.custom_process_mixin import CustomProcessMixin
+from vllm_omni.model_executor.models.output_templates import OmniOutput
+from vllm_omni.model_executor.models.utils import add_prefix_to_loaded_weights
+from vllm_omni.transformers_utils.configs.ming_flash_omni import BailingMM2Config, MingFlashOmniConfig
+
+from .ming_flash_omni_thinker import (
+    MingFlashOmniThinkerDummyInputsBuilder,
+    MingFlashOmniThinkerMultiModalProcessor,
+    MingFlashOmniThinkerProcessingInfo,
+)
+
+logger = init_logger(__name__)
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    MingFlashOmniThinkerMultiModalProcessor,
+    info=MingFlashOmniThinkerProcessingInfo,
+    dummy_inputs=MingFlashOmniThinkerDummyInputsBuilder,
+)
+class MingFlashOmniForConditionalGeneration(
+    nn.Module,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsMRoPE,
+    CustomProcessMixin,
+):
+    """Unified Ming-flash-omni-2.0 model combining thinker, imagegen, and talker.
+
+    One stage is built per instance, selected by ``model_config.model_stage``. Only ``"thinker"``
+    is implemented: ``"imagegen"``/``"talker"`` raise NotImplementedError, other names ValueError.
+    Every method below delegates to the built stage through ``self.model``.
+    """
+
+    supports_multimodal = True
+    requires_raw_input_tokens: bool = True
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        """Build the stage named by ``vllm_config.model_config.model_stage``."""
+        super().__init__()
+        self.have_multimodal_outputs = True
+        self.has_preprocess = False
+        self.has_postprocess = False
+
+        config = vllm_config.model_config.hf_config
+
+        self.vllm_config = vllm_config
+        self.config = config
+
+        # A top-level MingFlashOmniConfig wraps the thinker config; any other config is used as-is.
+        thinker_config = config.thinker_config if isinstance(config, MingFlashOmniConfig) else config
+        self.thinker_config: BailingMM2Config = thinker_config
+        self.model_stage = vllm_config.model_config.model_stage
+
+        if self.model_stage == "thinker":
+            thinker_vllm_config = vllm_config.with_hf_config(
+                thinker_config, architectures=["MingFlashOmniThinkerForConditionalGeneration"]
+            )
+            self.thinker = init_vllm_registered_model(
+                vllm_config=thinker_vllm_config,
+                prefix=maybe_prefix(prefix, "thinker"),
+                architectures=["MingFlashOmniThinkerForConditionalGeneration"],
+            )
+            self.model = self.thinker
+            self.imagegen = None
+            self.talker = None
+
+        elif self.model_stage == "imagegen":
+            # TODO: Implement image generator stage
+            raise NotImplementedError(
+                "Image generation stage is not yet implemented. Please use model_stage='thinker' for now."
+            )
+
+        elif self.model_stage == "talker":
+            # TODO: Implement talker (TTS) stage
+            raise NotImplementedError(
+                "Talker (TTS) stage is not yet implemented. Please use model_stage='thinker' for now."
+            )
+
+        else:
+            raise ValueError(
+                f"Invalid model_stage: {self.model_stage}. Must be one of: 'thinker', 'imagegen', 'talker'"
+            )
+
+        # Only the thinker branch reaches this point; every other stage has raised above.
+        self.make_empty_intermediate_tensors = self.thinker.make_empty_intermediate_tensors
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> OmniOutput:
+        """Run the active stage's forward pass and return its output unchanged."""
+        return self.model.forward(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+            **kwargs,
+        )
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata=None,
+    ) -> torch.Tensor | None:
+        """Delegate to the active stage; ``None`` if that stage has no ``compute_logits``."""
+        if hasattr(self.model, "compute_logits"):
+            return self.model.compute_logits(hidden_states, sampling_metadata)
+        return None
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata,
+    ):
+        """Delegate sampling to the active stage; raise if it does not provide ``sample``."""
+        if hasattr(self.model, "sample"):
+            return self.model.sample(logits, sampling_metadata)
+        raise NotImplementedError("sample method not available on current stage")
+
+    def get_mrope_input_positions(self, *args, **kwargs):
+        """Delegate to the active stage's ``get_mrope_input_positions``; raise if it has none."""
+        if hasattr(self.model, "get_mrope_input_positions"):
+            return self.model.get_mrope_input_positions(*args, **kwargs)
+        raise NotImplementedError("get_mrope_input_positions not available on current stage")
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load thinker weights and return the loaded names, each prefixed with ``thinker``.
+
+        ``imagegen.*`` and ``talker.*`` entries are skipped (stages not implemented yet);
+        names without a stage prefix are treated as thinker weights.
+        """
+        loaded_weights: set[str] = set()
+        if self.model_stage != "thinker":
+            return loaded_weights
+
+        # Stream instead of collecting lists so the checkpoint is never fully held in memory.
+        thinker_weights = (
+            (name.removeprefix("thinker."), value)
+            for name, value in weights
+            if not name.startswith(("imagegen.", "talker."))
+        )
+        thinker_loaded = self.thinker.load_weights(thinker_weights)
+        loaded_weights.update(add_prefix_to_loaded_weights(thinker_loaded, "thinker"))
+
+        # TODO: Load imagegen weights when implemented
+        # TODO: Load talker weights when implemented
+        return loaded_weights
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """Name the language-model, connector and tower module prefixes inside the thinker."""
+        return MultiModelKeys.from_string_field(
+            language_model="thinker.language_model",
+            connector=["thinker.linear_proj.", "thinker.linear_proj_audio."],
+            tower_model=["thinker.vision.", "thinker.audio."],
+        )
+
+    @property
+    def sampler(self):
+        """The active stage's sampler, or ``None`` when it has none."""
+        if hasattr(self.model, "sampler"):
+            return self.model.sampler
+        return None
+
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings=None,
+        *,
+        is_multimodal=None,
+    ) -> torch.Tensor:
+        """Delegate ``embed_input_ids`` to the active stage with the same arguments."""
+        return self.model.embed_input_ids(
+            input_ids,
+            multimodal_embeddings,
+            is_multimodal=is_multimodal,
+        )
+
+    def embed_multimodal(self, **kwargs):
+        """Delegate ``embed_multimodal`` to the active stage with the same keyword arguments."""
+        return self.model.embed_multimodal(**kwargs)
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py
new file mode 100644
index 00000000000..bde7477b945
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py
@@ -0,0 +1,893 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 The vLLM-Omni team.
+# Copyright 2024 ANT Group and the HuggingFace Inc. team.
+# Adapted from Ming repository modeling_bailingmm2.py and processing_bailingmm2.py
+# https://github.com/inclusionAI/Ming
+
+"""Ming-flash-omni-2.0 Thinker stage implementation (multimodal understanding)."""
+
+from collections.abc import Iterable, Iterator, Mapping, Sequence
+from typing import Annotated, Any
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers.feature_extraction_utils import BatchFeature
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.inputs import MultiModalDataDict
+from vllm.logger import init_logger
+from vllm.model_executor.models.interfaces import (
+    MultiModalEmbeddings,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
+from vllm.model_executor.models.qwen2_5_vl import (
+    Qwen2_5_VLImageInputs,
+    Qwen2_5_VLImagePixelInputs,
+    Qwen2_5_VLVideoInputs,
+    Qwen2_5_VLVideoPixelInputs,
+)
+from vllm.model_executor.models.qwen2_vl import (
+    Qwen2VLProcessingInfo,
+)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    _merge_multimodal_embeddings,
+    maybe_prefix,
+)
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargsItems,
+)
+from vllm.multimodal.parse import (
+    AudioProcessorItems,
+    ImageProcessorItems,
+    MultiModalDataItems,
+    MultiModalDataParser,
+    VideoProcessorItems,
+)
+from vllm.multimodal.processing import (
+    BaseDummyInputsBuilder,
+    BaseMultiModalProcessor,
+    PromptReplacement,
+    PromptUpdate,
+    PromptUpdateDetails,
+)
+from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
+
+from vllm_omni.model_executor.custom_process_mixin import CustomProcessMixin
+from vllm_omni.model_executor.models.output_templates import OmniOutput
+from vllm_omni.transformers_utils.configs.ming_flash_omni import BailingMM2Config
+from vllm_omni.transformers_utils.processors.ming import (
+    PLACEHOLDER_AUDIO_TOKEN_IN_TEXT,
+    PLACEHOLDER_IMAGE_TOKEN_IN_TEXT,
+    PLACEHOLDER_VIDEO_TOKEN_IN_TEXT,
+    MingFlashOmniProcessor,
+    MingWhisperFeatureExtractor,
+)
+
+from .audio_encoder import WhisperAudioEncoder
+from .modeling_bailing_moe_v2 import BailingMoeV2ForCausalLM
+from .projectors import AudioProjector, VisionProjector
+from .vision_encoder import MingVisionEncoder
+
+logger = init_logger(__name__)
+
+
+class MingAudioInput(TensorSchema):  # audio tensors handed to the Thinker's audio feature extraction
+    """
+    Dimensions:
+        - b:  Batch size
+        - l:  Total audio frames (clips concatenated along the time axis)
+        - nm: Number of mel bins
+        - N:  Max number of audio clips per batch item
+    """
+
+    audio_feats: Annotated[  # mel features; the clips of one batch item share a row
+        torch.Tensor,
+        TensorShape("b", "l", "nm"),
+    ]
+
+    audio_feats_lengths: Annotated[  # frame count of each clip packed into a row
+        torch.Tensor,
+        TensorShape("b", "N"),
+    ]
+
+
+class MingFlashOmniThinkerProcessingInfo(Qwen2VLProcessingInfo):
+    def get_hf_config(self) -> BailingMM2Config:  # config lookup typed as BailingMM2Config
+        return self.ctx.get_hf_config(BailingMM2Config)
+
+    def get_hf_processor(self, **kwargs: object):  # kwargs are forwarded to the context
+        return self.ctx.get_hf_processor(MingFlashOmniProcessor, **kwargs)
+
+    def get_target_channels(self) -> int:
+        # See `_normalize_audio_tensor` in vllm_omni/transformers_utils/processors/ming.py
+        return 1
+
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
+        return dict.fromkeys(("image", "video", "audio"))  # None for every modality
+
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
+        """Return the max tokens per item for each modality with a positive count."""
+        counts = mm_counts or {}
+        wanted = {name for name, count in counts.items() if count > 0}
+        budget: dict[str, int] = {}
+
+        # Image/video values are taken from the parent class implementation.
+        visual = [name for name in ("image", "video") if name in wanted]
+        if visual:
+            parent_budget = super().get_mm_max_tokens_per_item(seq_len=seq_len, mm_counts=counts)
+            for name in visual:
+                budget[name] = parent_budget[name]
+
+        if "audio" in wanted:
+            # TODO: consider computing from audio config
+            budget["audio"] = 3000
+        return budget
+
+    def get_feature_extractor(self, **kwargs: object) -> MingWhisperFeatureExtractor:
+        """Return the `audio_processor` of the HF processor, checked for its type."""
+        extractor = self.get_hf_processor(**kwargs).audio_processor
+        assert isinstance(extractor, MingWhisperFeatureExtractor)
+        return extractor
+
+    def get_data_parser(self):  # parser configured with this model's audio settings
+        sampling_rate = self.get_feature_extractor().sampling_rate
+        channels = self.get_target_channels()
+        hidden_size = self._get_expected_hidden_size()
+        return MultiModalDataParser(
+            target_sr=sampling_rate, target_channels=channels, expected_hidden_size=hidden_size
+        )
+
+
+class MingFlashOmniThinkerDummyInputsBuilder(BaseDummyInputsBuilder[MingFlashOmniThinkerProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        """Build a prompt made only of placeholder tokens, one per requested item.
+
+        Placeholders are grouped by modality: images, then videos, then audios.
+        """
+        processor = self.info.get_hf_processor()
+        placeholders = (
+            ("image", processor.image_token),
+            ("video", processor.video_token),
+            ("audio", processor.audio_token),
+        )
+        return "".join(token * mm_counts.get(kind, 0) for kind, token in placeholders)
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        """Create fixed-size dummy media for each requested modality.
+
+        Images are 448x448, videos are 8 frames of 448x448, and each audio clip
+        is 3 s of random noise at 16 kHz, paired with its sample rate.
+
+        Args:
+            seq_len: Not consulted.
+            mm_counts: Number of items to generate per modality (missing -> 0).
+            mm_options: Not consulted.
+        """
+        side = 448  # width == height for both images and video frames
+        frames_per_video = 8
+        sampling_rate = 16000
+        samples_per_clip = int(3.0 * sampling_rate)  # 3 seconds of audio
+
+        # Images and videos come from the base-class dummy helpers.
+        images = self._get_dummy_images(width=side, height=side, num_images=mm_counts.get("image", 0))
+        videos = self._get_dummy_videos(
+            width=side,
+            height=side,
+            num_frames=frames_per_video,
+            num_videos=mm_counts.get("video", 0),
+        )
+        audios = [
+            (np.random.randn(samples_per_clip).astype(np.float32), sampling_rate)
+            for _ in range(mm_counts.get("audio", 0))
+        ]
+
+        return {"image": images, "video": videos, "audio": audios}
+
+
+class MingFlashOmniThinkerMultiModalProcessor(BaseMultiModalProcessor[MingFlashOmniThinkerProcessingInfo]):
+    """Multimodal processor for Ming-flash-omni Thinker stage.
+
+    Handles preprocessing of 1) image, 2) video, and 3) audio inputs,
+    and expands placeholder tokens to the correct number of patch tokens.
+    """
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, Any],
+        out_mm_kwargs: MultiModalKwargsItems,
+    ) -> Sequence[PromptUpdate]:  # one PromptReplacement per modality present in mm_items
+        tokenizer = self.info.get_tokenizer()
+        # NOTE(review): token ids are read from llm_config only; a fallback that
+        # resolves them through the tokenizer vocab is not implemented.
+        thinker_config = self.info.get_hf_config()
+
+        # patch/delimiter token IDs (used in replacement sequences)
+        image_start_token_id = thinker_config.llm_config.image_start_token
+        image_patch_token_id = thinker_config.llm_config.image_patch_token
+        image_end_token_id = thinker_config.llm_config.image_end_token
+
+        video_start_token_id = thinker_config.llm_config.video_start_token
+        frame_patch_token_id = thinker_config.llm_config.video_patch_token
+        video_end_token_id = thinker_config.llm_config.video_end_token
+
+        audio_start_token_id = thinker_config.llm_config.audio_start_token
+        audio_patch_token_id = thinker_config.llm_config.audio_patch_token
+        audio_end_token_id = thinker_config.llm_config.audio_end_token
+
+        vision_config = thinker_config.vision_config
+        spatial_merge_size = vision_config.spatial_merge_size if vision_config else 2  # default: 2
+
+        newline_token_ids: list[int] = tokenizer.encode("\n", add_special_tokens=False)
+
+        out_mm_data = out_mm_kwargs.get_data()
+
+        def get_replacement_image(item_idx: int) -> PromptUpdateDetails:
+            """Build the replacement token ids for image number `item_idx`."""
+            grid_thw = out_mm_data.get("image_grid_thw")
+            if grid_thw is None:
+                raise ValueError(
+                    "image_grid_thw missing from processor output; "
+                    "cannot determine image patch count for prompt replacement."
+                )
+            if isinstance(grid_thw, torch.Tensor):
+                thw = grid_thw[item_idx]
+                num_patches = int(thw.prod().item()) // (spatial_merge_size**2)
+            else:
+                thw = grid_thw[item_idx]
+                num_patches = (thw[0] * thw[1] * thw[2]) // (spatial_merge_size**2)
+
+            # Build token sequence: <image> <imagePatch>*N </image> \n
+            # The trailing newline is added on purpose to match the original model's processing.
+            tokens: list[int] = []
+            tokens.append(image_start_token_id)
+            tokens.extend([image_patch_token_id] * num_patches)
+            tokens.append(image_end_token_id)
+            # Refer to Ming's BailingMM2Processor._expand_image_tokens
+            # https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/processing_bailingmm2.py
+            tokens.extend(newline_token_ids)
+
+            # Only <imagePatch> tokens receive multimodal embeddings
+            return PromptUpdateDetails.select_token_id(tokens, image_patch_token_id)
+
+        def get_replacement_video(item_idx: int) -> PromptUpdateDetails:
+            """Build the replacement token ids for video number `item_idx`."""
+            grid_thw = out_mm_data.get("video_grid_thw", None)
+            if grid_thw is None:
+                raise ValueError(
+                    "video_grid_thw missing from processor output; "
+                    "cannot determine video patch count for prompt replacement."
+                )
+            if isinstance(grid_thw, torch.Tensor):
+                thw = grid_thw[item_idx]
+                num_patches = int(thw.prod().item()) // (spatial_merge_size**2)
+            else:
+                thw = grid_thw[item_idx]
+                num_patches = (thw[0] * thw[1] * thw[2]) // (spatial_merge_size**2)
+
+            # Build token sequence: <video> <framePatch>*N </video> \n
+            # The trailing newline is added on purpose to match the original model's processing.
+            tokens: list[int] = []
+            tokens.append(video_start_token_id)
+            tokens.extend([frame_patch_token_id] * num_patches)
+            tokens.append(video_end_token_id)
+            tokens.extend(newline_token_ids)
+
+            # Only <framePatch> tokens receive multimodal embeddings
+            return PromptUpdateDetails.select_token_id(tokens, frame_patch_token_id)
+
+        def get_replacement_audio(item_idx: int) -> PromptUpdateDetails:
+            """Build the replacement token ids for audio clip number `item_idx`."""
+            encoder_feats_lengths = out_mm_data.get("encoder_feats_lengths", None)
+            if encoder_feats_lengths is None:
+                raise ValueError(
+                    "encoder_feats_lengths missing from processor output; "
+                    "cannot determine audio patch count for prompt replacement."
+                )
+            if isinstance(encoder_feats_lengths, torch.Tensor):
+                num_patches = int(encoder_feats_lengths[item_idx].item())
+            else:
+                num_patches = encoder_feats_lengths[item_idx]
+
+            # Build token sequence: <audio> <audioPatch>*N </audio> (no trailing newline here)
+            tokens: list[int] = []
+            tokens.append(audio_start_token_id)
+            tokens.extend([audio_patch_token_id] * num_patches)
+            tokens.append(audio_end_token_id)
+
+            # Only <audioPatch> tokens receive multimodal embeddings
+            return PromptUpdateDetails.select_token_id(tokens, audio_patch_token_id)
+
+        # Register a replacement only for modalities that actually have items
+        updates: list[PromptUpdate] = []
+
+        if "image" in mm_items and mm_items.get_items("image", ImageProcessorItems):
+            updates.append(
+                PromptReplacement(
+                    modality="image",
+                    target=PLACEHOLDER_IMAGE_TOKEN_IN_TEXT,
+                    replacement=get_replacement_image,
+                )
+            )
+        if "video" in mm_items and mm_items.get_items("video", VideoProcessorItems):
+            updates.append(
+                PromptReplacement(
+                    modality="video",
+                    target=PLACEHOLDER_VIDEO_TOKEN_IN_TEXT,
+                    replacement=get_replacement_video,
+                )
+            )
+        if "audio" in mm_items and mm_items.get_items("audio", AudioProcessorItems):
+            updates.append(
+                PromptReplacement(
+                    modality="audio",
+                    target=PLACEHOLDER_AUDIO_TOKEN_IN_TEXT,
+                    replacement=get_replacement_audio,
+                )
+            )
+        return updates
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        """Describe how each processor output is split across multimodal items.
+
+        `pixel_values` / `pixel_values_videos` are flat (patches of all items
+        concatenated), so they are sliced by the per-item product of the matching
+        `*_grid_thw` row. Every other field is batched per item.
+
+        Only keys actually present in `hf_inputs` get an entry.
+        """
+        fields: dict[str, MultiModalFieldConfig] = {}
+
+        # (modality, flat pixel key, grid key); images first, then videos
+        visual_layout = (
+            ("image", "pixel_values", "image_grid_thw"),
+            ("video", "pixel_values_videos", "video_grid_thw"),
+        )
+        for modality, pixel_key, grid_key in visual_layout:
+            grid_thw = hf_inputs.get(grid_key, torch.empty((0, 3)))
+            if pixel_key in hf_inputs:
+                patches_per_item = grid_thw.prod(-1)
+                fields[pixel_key] = MultiModalFieldConfig.flat_from_sizes(modality, patches_per_item)
+            if grid_key in hf_inputs:
+                fields[grid_key] = MultiModalFieldConfig.batched(modality)
+
+        # Audio fields
+        audio_keys = (
+            "audio_feats",
+            "audio_feats_lengths",
+            "encoder_feats_lengths",
+            "placeholder_audio_loc_lens",
+        )
+        for audio_key in audio_keys:
+            if audio_key in hf_inputs:
+                fields[audio_key] = MultiModalFieldConfig.batched("audio")
+
+        return fields
+
+    def _hf_processor_applies_updates(
+        self,
+        prompt_text: str,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        tokenization_kwargs: Mapping[str, object],
+    ) -> bool:
+        return False  # constant: _call_hf_processor tokenizes with placeholders left unexpanded
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        """Call sub-processors for multimodal inputs and tokenize.
+
+        The image/audio sub-processors are called directly (not through
+        `MingFlashOmniProcessor.__call__`) so the high-level placeholder tokens
+        stay **unexpanded** in the tokenized output. `mm_kwargs` is not used here.
+        """
+        hf_processor = self.info.get_hf_processor()
+        tokenizer = self.info.get_tokenizer()
+
+        # Accumulates the outputs of every sub-processor; later updates win on key clashes.
+        data: dict[str, object] = {}
+
+        images = mm_data.get("images", None)
+        if images is not None:
+            image_outputs = hf_processor.image_processor(
+                images=images,
+                videos=None,
+                return_tensors="pt",
+            )
+            data.update(image_outputs)
+
+        videos = mm_data.get("videos", None)
+        if videos is not None:
+            # Videos go through the same image_processor, in a separate call.
+            video_outputs = hf_processor.image_processor(
+                images=None,
+                videos=videos,
+                return_tensors="pt",
+            )
+            # Rename keys so the video outputs do not overwrite the image outputs
+            if "pixel_values" in video_outputs:
+                video_outputs["pixel_values_videos"] = video_outputs.pop("pixel_values")
+            if "image_grid_thw" in video_outputs:
+                video_outputs["video_grid_thw"] = video_outputs.pop("image_grid_thw")
+            data.update(video_outputs)
+
+        audios = mm_data.get("audios", None)
+        if audios is not None:
+            # vLLM's AudioProcessorItems provides raw numpy arrays (already resampled).
+            # MingWhisperAudioProcessor expects (waveform, sr) tuples,
+            # so wrap them with the target sample rate; existing tuples pass through as-is.
+            target_sr = hf_processor.audio_processor.sampling_rate
+            audio_tuples = [(a, target_sr) if not isinstance(a, tuple) else a for a in audios]
+
+            audio_outputs = hf_processor.audio_processor(
+                audio_tuples,
+                return_tensors="pt",
+            )
+            data.update(audio_outputs)
+
+        # Tokenize text with placeholders still intact
+        text_outputs = tokenizer(prompt, return_tensors="pt", **tok_kwargs)
+        data.update(text_outputs)
+
+        return BatchFeature(data=data)
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    MingFlashOmniThinkerMultiModalProcessor,
+    info=MingFlashOmniThinkerProcessingInfo,
+    dummy_inputs=MingFlashOmniThinkerDummyInputsBuilder,
+)
+class MingFlashOmniThinkerForConditionalGeneration(
+    nn.Module,
+    SupportsMultiModal,
+    SupportsPP,
+    SupportsMRoPE,
+    CustomProcessMixin,
+):
+    """Ming Thinker stage: multimodal understanding
+    (text + image + video + audio) -> text generation.
+    """
+
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={"model.": "language_model."},
+    )
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        """Return the prompt placeholder for `modality`, matched by prefix.
+
+        See vllm_omni/transformers_utils/processors/ming.py. Raises ValueError otherwise.
+        """
+        placeholders = {"image": "<IMAGE>", "video": "<VIDEO>", "audio": "<AUDIO>"}
+        for prefix, placeholder in placeholders.items():
+            if modality.startswith(prefix):
+                return placeholder
+        raise ValueError("Only image, video, or audio modality is supported")
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):  # builds LLM, vision and audio towers
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+
+        thinker_config: BailingMM2Config = config
+        # All three sub-configs are mandatory; fail early if any is missing.
+        if (
+            thinker_config.llm_config is None
+            or thinker_config.vision_config is None
+            or thinker_config.audio_config is None
+        ):
+            raise ValueError(
+                "MingFlashOmniThinker requires `llm_config`, `vision_config`, and `audio_config` in `thinker_config`."
+            )
+
+        llm_config = thinker_config.llm_config
+
+        self.config = llm_config  # LLM sub-config; the patch-token ids are read from it
+        self.thinker_config = thinker_config  # full config (llm + vision + audio)
+        self.have_multimodal_outputs = True
+
+        # Initialize LLM as a component. NOTE(review): prefix "llm" != attribute name "language_model" - confirm.
+        with self._mark_language_model(vllm_config):
+            llm_vllm_config = vllm_config.with_hf_config(llm_config)
+            self.language_model = BailingMoeV2ForCausalLM(
+                vllm_config=llm_vllm_config, prefix=maybe_prefix(prefix, "llm")
+            )
+
+        # Ming thinker is inherently multimodal; initialize both towers eagerly.
+        with self._mark_tower_model(vllm_config, {"image", "video"}):
+            self.vision = MingVisionEncoder(
+                vision_config=thinker_config.vision_config,
+                quant_config=vllm_config.quant_config,
+                prefix=maybe_prefix(prefix, "vision"),
+            )
+            self.linear_proj = VisionProjector(
+                vision_dim=self.vision.image_emb_dim,
+                llm_dim=llm_config.hidden_size,
+                mlp_depth=getattr(thinker_config, "mlp_depth", 2),  # fallback 2 here vs 1 for audio - TODO confirm
+            )
+        logger.info("Initialized MingVisionEncoder and VisionProjector")
+
+        audio_cfg = thinker_config.audio_config
+        whisper_cfg = getattr(audio_cfg, "whisper_encoder_config", {}) or {}  # missing/None -> no extra kwargs
+        with self._mark_tower_model(vllm_config, "audio"):
+            self.audio = WhisperAudioEncoder(
+                **whisper_cfg,
+                use_flash_attn=True,
+            )
+            self.linear_proj_audio = AudioProjector(
+                audio_dim=self.audio.audio_emb_dim,
+                llm_dim=llm_config.hidden_size,
+                ds_kernel_size=getattr(audio_cfg, "ds_kernel_size", 3),
+                ds_stride=getattr(audio_cfg, "ds_stride", 2),
+                mlp_depth=getattr(thinker_config, "mlp_depth", 1),
+            )
+        logger.info("Initialized WhisperAudioEncoder and AudioProjector")
+
+        # Expose the language model's intermediate-tensor factory on this wrapper
+        self.make_empty_intermediate_tensors = self.language_model.make_empty_intermediate_tensors
+
+        logger.info("MingFlashOmniThinker initialized with vision and audio towers")
+
+    def extract_image_feature(self, pixel_values: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+        """Encode visual patches and project them to the LLM hidden size.
+
+        Args:
+            pixel_values: Flattened pixel values from vision processor.
+            grid_thw: [num_images, 3] tensor of (t, h, w) grid dimensions.
+
+        Returns:
+            [seq_len, hidden_size] L2-normalized image embeddings.
+
+        Raises:
+            ValueError: If the vision encoder is missing.
+        """
+        encoder = self.vision
+        if encoder is None:
+            raise ValueError("Vision encoder not initialized")
+        # Only the encoder call runs under bf16 autocast; the projector below does not.
+        with torch.amp.autocast(pixel_values.device.type, dtype=torch.bfloat16):
+            features = encoder(pixel_values, grid_thw=grid_thw)
+        if encoder.use_deepstack:
+            features = features[:, : encoder.image_emb_dim]  # keep the first image_emb_dim columns
+        return F.normalize(self.linear_proj(features), dim=-1)
+
+    def extract_audio_feature(
+        self, audio_feats: torch.Tensor, audio_feats_lengths: torch.Tensor
+    ) -> tuple[torch.Tensor, ...]:
+        """Extract and project audio features.
+
+        Args:
+            audio_feats: [B, L_total, n_mels] wrapped mel features — multiple audio
+                clips per batch item are concatenated along the time dimension
+                (time-first, as produced by MingWhisperFeatureExtractor).
+            audio_feats_lengths: [B, N] lengths of each audio clip per batch item.
+                N is the max number of clips per item; zero-padded entries are skipped.
+
+        Returns:
+            Tuple of per-clip [T'_i, hidden_size] projected audio embeddings.
+
+        Raises:
+            ValueError: If the audio encoder is not initialized.
+        """
+        if self.audio is None:
+            raise ValueError("Audio encoder not initialized")
+
+        # Unwrap packed [B, L_total, n_mels] into a list of [n_mels, T'_i] tensors,
+        # one per audio clip, as expected by WhisperAudioEncoder.
+        x_list: list[torch.Tensor] = []
+        audio_lens: list[int] = []
+        # One `.tolist()` copies every length to the host at once instead of a
+        # blocking `.item()` call per (row, clip) entry.
+        for i, clip_lens in enumerate(audio_feats_lengths.tolist()):
+            feat_index = 0
+            for feat_len in map(int, clip_lens):
+                if feat_len == 0:
+                    break  # the first zero ends this row's list of clips
+                mel_seg = audio_feats[i, feat_index : feat_index + feat_len].transpose(0, 1)
+                x_list.append(mel_seg)
+                audio_lens.append(feat_len)
+                feat_index += feat_len
+
+        audio_packed = self.audio(x_list, audio_lens)
+        # Compute per-clip lengths after Whisper Conv1d (kernel=3, stride=2, pad=1)
+        encoded_lens = [(audio_len - 3 + 2) // 2 + 1 for audio_len in audio_lens]
+
+        # Project packed
+        proj_packed, proj_lens = self.linear_proj_audio.forward_packed(audio_packed, encoded_lens)
+        if getattr(self.thinker_config.audio_config, "norm_query_embeds", False):
+            proj_packed = F.normalize(proj_packed, dim=-1)
+        proj_packed = proj_packed.to(audio_feats.dtype)
+
+        # Split into per-clip tensors
+        return proj_packed.split(proj_lens)
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        """Group multimodal kwargs into one input object per modality, in kwargs order."""
+        mm_input_by_modality: dict[str, Qwen2_5_VLImageInputs | Qwen2_5_VLVideoInputs | MingAudioInput] = {}
+
+        for key in kwargs:  # dict order of kwargs decides the order of the result
+            if key == "pixel_values" and "image" not in mm_input_by_modality:
+                pixel_values = kwargs.get("pixel_values")
+                image_grid_thw = kwargs.get("image_grid_thw")
+                if pixel_values is not None and image_grid_thw is not None:  # both are needed, else skipped
+                    mm_input_by_modality["image"] = Qwen2_5_VLImagePixelInputs(
+                        type="pixel_values",
+                        pixel_values=pixel_values,  # type: ignore[arg-type]
+                        image_grid_thw=image_grid_thw,  # type: ignore[arg-type]
+                    )
+            elif key == "pixel_values_videos" and "video" not in mm_input_by_modality:
+                pixel_values_videos = kwargs.get("pixel_values_videos")
+                video_grid_thw = kwargs.get("video_grid_thw")
+                second_per_grid_ts = kwargs.get("second_per_grid_ts")  # passed through even when None
+                if pixel_values_videos is not None and video_grid_thw is not None:
+                    mm_input_by_modality["video"] = Qwen2_5_VLVideoPixelInputs(
+                        type="pixel_values_videos",
+                        pixel_values_videos=pixel_values_videos,  # type: ignore[arg-type]
+                        video_grid_thw=video_grid_thw,  # type: ignore[arg-type]
+                        second_per_grid_ts=second_per_grid_ts,  # type: ignore[arg-type]
+                    )
+            elif key == "audio_feats" and "audio" not in mm_input_by_modality:
+                audio_feats = kwargs.get("audio_feats")
+                audio_feats_lengths = kwargs.get("audio_feats_lengths")
+                if audio_feats is not None and audio_feats_lengths is not None:
+                    mm_input_by_modality["audio"] = MingAudioInput(
+                        audio_feats=audio_feats,  # type: ignore[arg-type]
+                        audio_feats_lengths=audio_feats_lengths,  # type: ignore[arg-type]
+                    )
+
+        return mm_input_by_modality
+
+    def _process_image_input(self, image_input: Qwen2_5_VLImageInputs) -> list[torch.Tensor]:
+        """Encode all images in one pass and return one embedding tensor per image."""
+        grid = image_input["image_grid_thw"]
+        flat_embeds = self.extract_image_feature(image_input["pixel_values"], grid)
+        # Each image's row count is prod(t, h, w) divided by spatial_merge_size**2.
+        tokens_per_merge = self.thinker_config.vision_config.spatial_merge_size**2
+        per_image = grid.prod(dim=-1) // tokens_per_merge
+        chunk_sizes = list(map(int, per_image.tolist()))
+        return list(flat_embeds.split(chunk_sizes, dim=0))
+
+    def _process_video_input(self, video_input: Qwen2_5_VLVideoInputs) -> list[torch.Tensor]:
+        """Encode all videos in one pass and return one embedding tensor per video."""
+        grid = video_input["video_grid_thw"]
+        flat_embeds = self.extract_image_feature(video_input["pixel_values_videos"], grid)
+        tokens_per_merge = self.thinker_config.vision_config.spatial_merge_size**2
+        chunk_sizes = [int(n) for n in (grid.prod(dim=-1) // tokens_per_merge).tolist()]
+        return list(flat_embeds.split(chunk_sizes, dim=0))
+
+    def _process_audio_input(self, audio_input: MingAudioInput) -> list[torch.Tensor]:  # one tensor per clip
+        return list(self.extract_audio_feature(audio_input["audio_feats"], audio_input["audio_feats_lengths"]))
+
+    def _compute_modality_masks(self, input_ids: torch.Tensor) -> tuple[torch.Tensor | None, torch.Tensor | None]:
+        """Derive the vision and audio MoE-routing masks from `input_ids`.
+
+        A position counts as vision when it holds the image or video patch token
+        id, and as audio when it holds the audio patch token id (`self.config`).
+
+        Returns:
+            Tuple of (vision_mask, audio_mask), each shape [seq_len] bool.
+        """
+        cfg = self.config
+
+        # vision mask: image patches OR video patches
+        is_image = input_ids == cfg.image_patch_token
+        is_video = input_ids == cfg.video_patch_token
+        vision_mask = is_image | is_video
+
+        # audio mask
+        audio_mask = input_ids == cfg.audio_patch_token
+
+        return vision_mask, audio_mask
+
+    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        """Encode every multimodal input found in `kwargs` into embeddings.
+
+        Returns an empty list when no modality input is found, otherwise a tuple of
+        per-item tensors in the order the modalities were parsed.
+        """
+        parsed = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not parsed:
+            return []
+
+        encoders = {
+            "image": self._process_image_input,
+            "video": self._process_video_input,
+            "audio": self._process_audio_input,
+        }
+        # preserve the order of modalities
+        return tuple(emb for modality, value in parsed.items() for emb in encoders[modality](value))
+
+    def embed_input_ids(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
+        *,
+        is_multimodal: torch.Tensor | None = None,
+        handle_oov_mm_token: bool = False,
+    ) -> torch.Tensor:
+        """Embed token ids, merging in multimodal embeddings when provided."""
+        inputs_embeds = self.language_model.model.word_embeddings(input_ids)
+        if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
+            return inputs_embeds
+        # Explicit raise instead of `assert`: asserts are stripped under `python -O`,
+        # which would defer the failure to the merge helper.
+        if is_multimodal is None:
+            raise ValueError("`is_multimodal` mask required when `multimodal_embeddings` provided")
+        return _merge_multimodal_embeddings(
+            inputs_embeds=inputs_embeds, multimodal_embeddings=multimodal_embeddings, is_multimodal=is_multimodal
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> OmniOutput:
+        """Run the language model with modality masks and wrap the result.
+
+        The same hidden-state tensor is returned as `text_hidden_states` and under
+        `multimodal_outputs["final_hidden_states"]`. Extra `kwargs` are ignored.
+        """
+        # Compute MoE modality masks on every device
+        vision_mask, audio_mask = self._compute_modality_masks(input_ids)
+        hidden = self.language_model.forward(
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+            image_mask=vision_mask,
+            audio_mask=audio_mask,
+        )
+        # Capture embeddings for downstream stages
+        return OmniOutput(
+            text_hidden_states=hidden,
+            multimodal_outputs={"final_hidden_states": hidden},
+        )
+
+    def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) -> torch.Tensor | None:
+        """Delegate logit computation to the wrapped language model."""
+        lm = self.language_model
+        return lm.compute_logits(hidden_states, sampling_metadata)
+
+    def sample(self, logits: torch.Tensor, sampling_metadata):
+        """Delegate sampling to the wrapped language model."""
+        lm = self.language_model
+        return lm.sample(logits, sampling_metadata)
+
+    @property
+    def sampler(self):
+        """Sampler object owned by the wrapped language model."""
+        lm = self.language_model
+        return lm.sampler
+
+    def iter_mm_features(
+        self,
+        mm_features: list[MultiModalFeatureSpec],
+    ) -> Iterator[tuple[int, str, dict[str, Any]]]:
+        """Yield grid information for image/video features in offset order.
+
+        Each item is ``(offset, modality, feature_data)``; ``feature_data``
+        has the keys "grid_t", "grid_h", "grid_w" and "second_per_grid_t".
+        ``grid_h``/``grid_w`` are floor-divided by
+        ``config.spatial_merge_size``. Images report a ``second_per_grid_t``
+        of 0.0; videos report the value stored under "second_per_grid_ts",
+        or 1.0 when that field is absent.
+
+        Features whose ``data`` is None are skipped, as is every modality
+        other than image/video. Audio features are therefore not yielded:
+        Ming assigns them sequential text positions (same T/H/W value)
+        rather than 3D grid positions.
+        """
+        merge = self.config.spatial_merge_size
+        by_offset = sorted(mm_features, key=lambda feat: feat.mm_position.offset)
+
+        for feat in by_offset:
+            fields = feat.data
+            if fields is None:
+                continue
+
+            start = feat.mm_position.offset
+            kind = feat.modality
+
+            if kind == "image":
+                label = "image"
+                grid_t, grid_h, grid_w = fields["image_grid_thw"].data.tolist()
+                seconds = 0.0
+            elif kind == "video":
+                label = "video"
+                grid_t, grid_h, grid_w = fields["video_grid_thw"].data.tolist()
+                seconds_field = fields.get("second_per_grid_ts")
+                seconds = 1.0 if seconds_field is None else float(seconds_field.data.item())
+            else:
+                continue
+
+            yield (
+                start,
+                label,
+                {
+                    "grid_t": int(grid_t),
+                    "grid_h": int(grid_h) // merge,
+                    "grid_w": int(grid_w) // merge,
+                    "second_per_grid_t": seconds,
+                },
+            )
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        mm_features: list[MultiModalFeatureSpec] | None = None,
+        **kwargs: object,
+    ) -> tuple[torch.Tensor, int]:
+        """Compute M-RoPE input positions using mm_features directly.
+
+        Text runs receive identical, consecutive T/H/W positions. Every
+        image/video feature yielded by ``iter_mm_features`` receives a 3-D
+        grid of positions shifted by the next free position index.
+
+        Args:
+            input_tokens: Prompt token ids; only their count is used.
+            mm_features: Multimodal feature specs, or ``None`` for text-only.
+            **kwargs: Ignored.
+
+        Returns:
+            ``(position_ids, mrope_position_delta)``. ``position_ids`` has
+            three rows (T, H, W). The delta is
+            ``max(position_ids) + 1 - len(input_tokens)``, or ``0`` when no
+            positions were produced (empty prompt).
+        """
+        llm_config = self.config
+        tokens_per_second: int = getattr(llm_config, "tokens_per_second", 2)
+        seq_len = len(input_tokens)
+
+        llm_pos_ids_list: list[np.ndarray] = []
+        st = 0  # index of next unprocessed token
+
+        for patch_offset, _modality, data in self.iter_mm_features(mm_features or []):
+            # Text tokens between the previous feature and this one.
+            text_len = patch_offset - st
+            st_idx = int(llm_pos_ids_list[-1].max()) + 1 if llm_pos_ids_list else 0
+            if text_len > 0:
+                llm_pos_ids_list.append(np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx)
+                st_idx += text_len
+
+            # 3-D grid positions for patch tokens
+            grid_t: int = data["grid_t"]
+            grid_h: int = data["grid_h"]
+            grid_w: int = data["grid_w"]
+            second_per_grid_t: float = data["second_per_grid_t"]
+
+            t_raw = np.arange(grid_t)
+            if second_per_grid_t > 0:
+                # Scale frame indices by seconds-per-frame and tokens-per-second.
+                t_index = (t_raw * second_per_grid_t * tokens_per_second).astype(np.int64)
+            else:
+                t_index = t_raw.astype(np.int64)
+            t_index = np.repeat(t_index, grid_h * grid_w)
+
+            h_index = np.tile(np.arange(grid_h).repeat(grid_w), grid_t)
+            w_index = np.tile(np.arange(grid_w), grid_t * grid_h)
+
+            llm_pos_ids_list.append(np.stack([t_index, h_index, w_index]) + st_idx)
+
+            num_patches = grid_t * grid_h * grid_w
+            st = patch_offset + num_patches
+
+        if st < seq_len:
+            # Trailing text (this is also the whole prompt for text-only input).
+            st_idx = int(llm_pos_ids_list[-1].max()) + 1 if llm_pos_ids_list else 0
+            tail_len = seq_len - st
+            llm_pos_ids_list.append(np.broadcast_to(np.arange(tail_len), (3, tail_len)) + st_idx)
+
+        if llm_pos_ids_list:
+            position_ids = torch.from_numpy(np.concatenate(llm_pos_ids_list, axis=1).astype(np.int64))  # (3, seq_len)
+        else:
+            # Only reachable for an empty prompt without features.
+            position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0).expand(3, -1)
+
+        if position_ids.numel() == 0:
+            # `max()` of an empty tensor raises; with no positions there is
+            # no offset between position index and token index to report.
+            return position_ids, 0
+
+        mrope_position_delta = int(position_ids.max().item()) + 1 - seq_len
+        return position_ids, mrope_position_delta
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Load ``weights`` through an AutoWeightsLoader using ``hf_to_vllm_mapper``.
+
+        Returns whatever the loader's ``load_weights`` returns.
+        """
+        return AutoWeightsLoader(self).load_weights(weights, mapper=self.hf_to_vllm_mapper)
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py b/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py
new file mode 100644
index 00000000000..1ff362c5b9d
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_flash_omni/modeling_bailing_moe_v2.py
@@ -0,0 +1,896 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 The vLLM-Omni team.
+# Copyright 2023 Antgroup and The HuggingFace Inc. team. All rights reserved.
+# Adapted from Ming
+# https://github.com/inclusionAI/Ming/blob/2a0c02ae3130190160c215f89fce7de3005db483/modeling_bailing_moe_v2.py
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Iterable
+
+import torch
+from torch import nn
+from vllm.compilation.decorators import support_torch_compile
+from vllm.config import VllmConfig
+from vllm.config.cache import CacheConfig
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.rotary_embedding.mrope import MRotaryEmbedding
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead, VocabParallelEmbedding
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.utils import (
+    PPMissingLayer,
+    WeightsMapper,
+    make_empty_intermediate_tensors_factory,
+    make_layers,
+    maybe_prefix,
+)
+from vllm.sequence import IntermediateTensors
+from vllm.v1.outputs import SamplerOutput
+from vllm.v1.sample.sampler import Sampler
+
+from vllm_omni.model_executor.custom_process_mixin import CustomProcessMixin
+from vllm_omni.transformers_utils.configs.ming_flash_omni import BailingMoeV2Config
+
+logger = init_logger(__name__)
+
+
+class MingVideoRopeMRotaryEmbedding(MRotaryEmbedding):
+    """MRotaryEmbedding variant that lays out cos/sin the way Ming's video_rope does.
+
+    Standard mrope assigns contiguous frequency sections to T, H and W.
+    video_rope instead alternates H and W element-wise across the spatial
+    section and keeps the temporal frequencies at the end:
+        Standard mrope:  [T T T ... H H H ... W W W ...]
+        Video rope:      [H W H W ... H W ... T T T ...]
+
+    Refer to Ming's BailingMoeV2RotaryEmbedding3D
+    https://github.com/inclusionAI/Ming/blob/2a0c02ae3130190160c215f89fce7de3005db483/modeling_bailing_moe_v2.py#L174
+    """
+
+    def _remap_video_rope(
+        self,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Collapse per-axis cos/sin tables into the video_rope layout.
+
+        Args:
+            cos, sin: [3, num_tokens, rotary_dim // 2]
+        Returns:
+            cos, sin: [num_tokens, rotary_dim // 2]
+
+        Refer to Ming's apply_3d_rotary_pos_emb
+        https://github.com/inclusionAI/Ming/blob/2a0c02ae3130190160c215f89fce7de3005db483/modeling_bailing_moe_v2.py#L226
+        """
+        assert self.mrope_section is not None
+        spatial_width = self.mrope_section[1] + self.mrope_section[2]
+
+        def interleave(table: torch.Tensor) -> torch.Tensor:
+            merged = torch.empty_like(table[0])
+            # Spatial part: even slots come from H (row 1), odd slots from W (row 2).
+            merged[:, 0:spatial_width:2] = table[1, :, 0:spatial_width:2]
+            merged[:, 1:spatial_width:2] = table[2, :, 1:spatial_width:2]
+            # Everything after the spatial part comes from T (row 0).
+            merged[:, spatial_width:] = table[0, :, spatial_width:]
+            return merged
+
+        return interleave(cos), interleave(sin)
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """Apply rotary embedding to ``query`` and ``key``.
+
+        2-D ``positions`` go through the video_rope remapping; 1-D
+        ``positions`` use the looked-up cos/sin directly. ``offsets`` is
+        accepted but not read.
+        """
+        assert positions.ndim in (1, 2)
+        assert key is not None
+
+        table = self._match_cos_sin_cache_dtype(query)
+        token_count = positions.shape[-1]
+        cos, sin = table[positions].chunk(2, dim=-1)
+
+        if positions.ndim == 2:
+            cos, sin = self._remap_video_rope(cos, sin)
+
+        def rotate(states: torch.Tensor) -> torch.Tensor:
+            # Rotate the first `rotary_dim` channels of each head and pass the
+            # remaining channels through unchanged.
+            original_shape = states.shape
+            per_head = states.view(token_count, -1, self.head_size)
+            rotated = self.apply_rotary_emb.forward_native(per_head[..., : self.rotary_dim], cos, sin)
+            untouched = per_head[..., self.rotary_dim :]
+            return torch.cat((rotated, untouched), dim=-1).reshape(original_shape)
+
+        return rotate(query), rotate(key)
+
+    def forward_cuda(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """Use the parent CUDA path for 1-D positions, native path for 2-D."""
+        if positions.ndim != 2:
+            return super().forward_cuda(positions, query, key, offsets)
+        # No custom Triton kernel for video_rope; fall back to native for 3D
+        # TODO: Consider custom optimization
+        return self.forward_native(positions, query, key, offsets)
+
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """CPU execution always uses the native implementation."""
+        return self.forward_native(positions, query, key, offsets)
+
+
+class BailingMoeV2MLP(nn.Module):
+    """Gated MLP: merged gate/up projection, SiLU-and-mul, down projection."""
+
+    def __init__(
+        self,
+        config: BailingMoeV2Config,
+        intermediate_size: int,
+        hidden_act: str = "silu",
+        quant_config: QuantizationConfig | None = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ):
+        """Build the projections.
+
+        Raises:
+            ValueError: If ``hidden_act`` is anything other than "silu".
+        """
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = intermediate_size
+
+        hidden, inner = self.hidden_size, self.intermediate_size
+
+        # Gate and up projections share one merged column-parallel layer.
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden,
+            [inner, inner],
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            inner,
+            hidden,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
+
+        if hidden_act == "silu":
+            self.act_fn = SiluAndMul()
+        else:
+            raise ValueError(f"Unsupported activation: {hidden_act}")
+
+    def forward(self, x):
+        """Apply gate/up projection, activation and down projection to ``x``."""
+        projected, _ = self.gate_up_proj(x)
+        activated = self.act_fn(projected)
+        out, _ = self.down_proj(activated)
+        return out
+
+
+class BailingMoeV2Gate(nn.Module):
+    """MoE routing gate that picks experts from a limited set of expert groups."""
+
+    def __init__(
+        self,
+        config: BailingMoeV2Config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.top_k = config.num_experts_per_tok
+        self.num_experts = config.num_experts
+
+        self.n_group = config.n_group
+        self.topk_group = config.topk_group
+
+        self.gating_dim = config.hidden_size
+
+        # One logit per expert, computed by a replicated linear layer.
+        self.gate = ReplicatedLinear(
+            self.gating_dim,
+            self.num_experts,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.routed_scaling_factor = config.routed_scaling_factor
+
+        # Per-expert bias added to the scores for expert selection only.
+        self.expert_bias = nn.Parameter(torch.zeros(self.num_experts), requires_grad=False)
+
+    def group_limited_topk(self, scores: torch.Tensor):
+        """Select ``top_k`` experts restricted to the ``topk_group`` best groups.
+
+        A group's score is the sum of its two highest expert scores. Experts
+        outside the selected groups are masked to ``-inf`` before the final
+        top-k. Returns ``(values, indices)`` of that top-k.
+        """
+        token_count, _ = scores.shape
+
+        # Rank the groups.
+        per_group = scores.view(token_count, self.n_group, -1)
+        group_scores = per_group.topk(2, dim=-1)[0].sum(dim=-1)
+        _, kept_groups = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)
+        group_keep = torch.zeros_like(group_scores)
+        group_keep.scatter_(1, kept_groups, 1)
+
+        # Expand the group selection to a per-expert keep mask.
+        experts_per_group = self.num_experts // self.n_group
+        expert_keep = group_keep.unsqueeze(-1).expand(token_count, self.n_group, experts_per_group)
+        expert_keep = expert_keep.reshape(token_count, -1)
+
+        restricted = scores.masked_fill(~expert_keep.bool(), float("-inf"))
+        probs, top_indices = torch.topk(restricted, k=self.top_k, dim=-1, sorted=False)
+        return probs, top_indices
+
+    def forward(self, hidden_states):
+        """Return ``(topk_idx, topk_weight, logits)`` for the given hidden states.
+
+        Expert selection uses sigmoid scores plus ``expert_bias``; the
+        returned weights are the un-biased sigmoid scores of the selected
+        experts, normalised to sum to one when ``top_k > 1`` and multiplied
+        by ``routed_scaling_factor``.
+        """
+        # compute gating score
+        flat_states = hidden_states.view(-1, hidden_states.shape[-1])
+        logits, _ = self.gate(flat_states)
+        logits = logits.float()
+
+        sigmoid_scores = torch.sigmoid(logits)
+        _, topk_idx = self.group_limited_topk(sigmoid_scores + self.expert_bias)
+
+        picked = torch.gather(sigmoid_scores, dim=1, index=topk_idx).type_as(logits)
+        if self.top_k > 1:
+            topk_weight = picked / (picked.sum(dim=-1, keepdim=True) + 1e-20)
+        else:
+            topk_weight = picked
+        topk_weight = topk_weight * self.routed_scaling_factor
+
+        return topk_idx, topk_weight, logits
+
+
+def _unpack_multi_routing(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Split pre-computed routing results back out of ``gating_output``.
+
+    Passed to `FusedMoE` as its `custom_routing_function`. The caller packs
+    (topk_weight, topk_idx) into `gating_output` before calling
+    FusedMoE.forward(); this function holds no state and only undoes that
+    packing. ``hidden_states`` and ``renormalize`` are not read.
+
+    Args:
+        gating_output: [num_tokens, top_k * 2]
+            - [:, :top_k], topk_weight (float)
+            - [:, top_k:], topk_idx   (float, cast back to int)
+
+    Returns:
+        ``(topk_weight, topk_idx)`` as float32 and int32 tensors.
+    """
+    weight_columns = gating_output[:, :topk]
+    index_columns = gating_output[:, topk:]
+    weights = weight_columns.contiguous().to(torch.float32)
+    indices = index_columns.to(torch.int32)
+    return weights, indices
+
+
+class BailingMoeV2SparseMoeBlock(nn.Module):
+    """Sparse MoE block with MultiRouter support for multimodal routing.
+
+    The multi-router gating logic is kept external to the fused MoE layer:
+    top-k expert ids and weights are computed here by `BailingMoeV2Gate`
+    modules, packed into one tensor, and unpacked again inside the fused
+    layer by `_unpack_multi_routing`.
+    """
+
+    def __init__(
+        self,
+        config: BailingMoeV2Config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        """Build shared experts (optional), routed experts and the gate(s).
+
+        Raises:
+            ValueError: If ``config.router_type`` is neither "topN" nor
+                "MultiRouter".
+        """
+        super().__init__()
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_experts_per_tok = config.num_experts_per_tok
+
+        if isinstance(self.config.num_shared_experts, int) and self.config.num_shared_experts > 0:
+            self.shared_experts = BailingMoeV2MLP(
+                config=self.config,
+                intermediate_size=self.config.moe_intermediate_size * self.config.num_shared_experts,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+            )
+        else:
+            self.shared_experts = None
+
+        self.experts = SharedFusedMoE(
+            shared_experts=self.shared_experts,
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            custom_routing_function=_unpack_multi_routing,
+            renormalize=False,  # we handle normalization in the gate
+            reduce_results=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+        )
+
+        self.experts.expert_mapping = FusedMoE.make_expert_params_mapping(
+            self.experts,
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=config.num_experts,
+        )
+
+        # "topN" uses a single gate; "MultiRouter" adds one gate per modality.
+        self.router_type = self.config.router_type
+        if self.router_type == "topN":
+            self.gate = BailingMoeV2Gate(self.config, quant_config, prefix=f"{prefix}.gate")
+        elif self.router_type == "MultiRouter":
+            self.gate = BailingMoeV2Gate(self.config, quant_config, prefix=f"{prefix}.gate")
+            self.image_gate = BailingMoeV2Gate(self.config, quant_config, prefix=f"{prefix}.image_gate")
+            self.audio_gate = BailingMoeV2Gate(self.config, quant_config, prefix=f"{prefix}.audio_gate")
+        else:
+            raise ValueError(f"Unsupported router_type: {self.router_type}")
+
+    @staticmethod
+    def _normalize_mask(
+        mask: torch.Tensor,
+        bsz: int,
+        seq_len: int,
+        name: str,
+    ) -> torch.Tensor:
+        """Validate and reshape a modality mask to [bsz*seq_len, 1] bool.
+
+        Accepts masks of shape [N], [bsz, seq_len] or [bsz, seq_len, 1].
+
+        Raises:
+            ValueError: If ``mask`` has any other number of dimensions.
+        """
+        N = bsz * seq_len
+        if mask.ndim == 1:
+            # vLLM path: flat tokens [N]
+            assert mask.shape[0] == N, f"{name} length {mask.shape[0]} != N={N}"
+        elif mask.ndim == 2:
+            assert mask.shape == (bsz, seq_len), f"{name} shape {mask.shape} != ({bsz}, {seq_len})"
+        elif mask.ndim == 3:
+            assert mask.shape == (bsz, seq_len, 1), f"{name} shape {mask.shape} != ({bsz}, {seq_len}, 1)"
+        else:
+            raise ValueError(f"Unsupported {name} shape: {mask.shape}")
+
+        return mask.reshape(N, 1).bool()
+
+    def forward(
+        self,
+        hidden_states,
+        image_mask: torch.Tensor | None = None,
+        audio_mask: torch.Tensor | None = None,
+    ):
+        """Route tokens to experts and return the combined expert output.
+
+        Args:
+            hidden_states: 2D or 3D hidden states; the output has the same
+                number of dimensions.
+            image_mask: Marks image tokens (see ``_normalize_mask`` for the
+                accepted shapes). ``None`` means no token uses the image
+                gate. Only read when ``router_type`` is "MultiRouter".
+            audio_mask: Same as ``image_mask`` but for the audio gate.
+
+        Returns:
+            Routed expert output plus the shared-expert output when shared
+            experts exist.
+        """
+        # TODO(yuanheng-zhao): revise the shapes in the flow
+        assert 2 <= hidden_states.dim() <= 3, f"{self.__class__.__name__} only supports 2D or 3D inputs"
+        input_is_2d = hidden_states.ndim == 2
+        if input_is_2d:
+            hidden_states = hidden_states.unsqueeze(0)
+
+        bsz, seq_len, h = hidden_states.shape
+
+        # Default routing; the only routing for router_type "topN".
+        topk_idx, topk_weight, _ = self.gate(hidden_states)
+
+        if self.router_type == "MultiRouter":
+            # Override the default routing on tokens flagged by a modality
+            # mask. A missing mask leaves the default routing untouched (the
+            # decoder layer passes None when no mask is supplied). The masks
+            # are expected to be disjoint; if they overlap, audio wins
+            # because it is applied last.
+            modality_routers = (
+                ("image_mask", image_mask, self.image_gate),
+                ("audio_mask", audio_mask, self.audio_gate),
+            )
+            for name, mask, gate in modality_routers:
+                if mask is None:
+                    continue
+                mask = self._normalize_mask(mask, bsz, seq_len, name).to(hidden_states.device)
+                modality_idx, modality_weight, _ = gate(hidden_states)
+                topk_idx = torch.where(mask, modality_idx, topk_idx)
+                topk_weight = torch.where(mask, modality_weight, topk_weight)
+
+        # Pack pre-computed routing into a single tensor
+        # NOTE(review): expert ids travel in hidden_states.dtype; confirm that
+        # every id below config.num_experts is exactly representable in it.
+        packed_routing = torch.cat(
+            [
+                topk_weight.to(hidden_states.dtype),
+                topk_idx.to(hidden_states.dtype),
+            ],
+            dim=-1,
+        )
+
+        # SharedFusedMoE expects 2D hidden_states
+        hidden_states_2d = hidden_states.view(-1, h)
+        result = self.experts(hidden_states_2d, packed_routing)
+
+        # Accept both return shapes: a (shared_output, fused_output) pair or
+        # a bare fused-output tensor.
+        if isinstance(result, tuple):
+            shared_output, fused_out = result
+        else:
+            shared_output, fused_out = None, result
+
+        final_hidden_states = fused_out + shared_output if shared_output is not None else fused_out
+
+        final_hidden_states = final_hidden_states.view(bsz, seq_len, h)
+
+        return final_hidden_states.squeeze(0) if input_is_2d else final_hidden_states
+
+
+class BailingMoeV2Attention(nn.Module):
+    """Multi-head attention built on vLLM's Attention layer, with 3D RoPE."""
+
+    def __init__(
+        self,
+        config: BailingMoeV2Config,
+        layer_idx: int,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        """Create projections, q/k norms, the rotary embedding and attention.
+
+        Raises:
+            ValueError: If ``config.rope_scaling`` is None.
+        """
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+
+        self.hidden_size = config.hidden_size
+        total_num_heads = config.num_attention_heads
+        total_num_kv_heads = config.num_key_value_heads
+        self.head_dim = config.head_dim
+
+        # Per-rank head counts under tensor parallelism.
+        tp_size = get_tensor_model_parallel_world_size()
+        assert total_num_heads % tp_size == 0
+        self.num_heads = total_num_heads // tp_size
+        self.num_kv_heads = max(1, total_num_kv_heads // tp_size)
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        # Only part of each head is rotated.
+        self.rope_dim = int(self.head_dim * config.partial_rotary_factor)
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            total_num_heads,
+            total_num_kv_heads,
+            bias=config.use_qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.dense = RowParallelLinear(
+            total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
+
+        # vLLM RMSNorm is used instead of BailingMoeV2RMSNorm; small
+        # differences from the reference implementation might exist.
+        norm_eps = config.rms_norm_eps
+        self.q_norm = RMSNorm(self.head_dim, eps=norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=norm_eps)
+
+        # 3D Rotary embeddings for multimodal
+        if config.rope_scaling is None:
+            raise ValueError("rope_scaling must not be None")
+
+        scaling_cfg = config.rope_scaling
+        rope_type = scaling_cfg.get("rope_type", scaling_cfg.get("type"))
+        mrope_section = scaling_cfg.get("mrope_section", [8, 12, 12])
+
+        if rope_type == "video_rope":
+            # Ming-specific video_rope with custom H/W interleaving
+            self.rotary_emb = MingVideoRopeMRotaryEmbedding(
+                head_size=self.head_dim,
+                rotary_dim=self.rope_dim,
+                max_position_embeddings=config.max_position_embeddings,
+                base=config.rope_theta,
+                is_neox_style=True,
+                dtype=torch.get_default_dtype(),
+                mrope_section=mrope_section,
+            )
+        else:
+            # Standard m_rope (rope_type "3D", "default", or None)
+            rope_overrides = dict(scaling_cfg)
+            rope_overrides["rope_type"] = "default"  # normalize for get_rope dispatch
+            rope_overrides["mrope_section"] = mrope_section
+            rope_parameters = {
+                "rope_theta": config.rope_theta,
+                "partial_rotary_factor": config.partial_rotary_factor,
+                **rope_overrides,
+            }
+            self.rotary_emb = get_rope(
+                head_size=self.head_dim,
+                max_position=config.max_position_embeddings,
+                is_neox_style=True,
+                rope_parameters=rope_parameters,
+            )
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        """Run attention with per-head q/k normalisation and 3D RoPE.
+
+        Args:
+            positions: Position IDs, shape (3, num_tokens) for 3D rope
+                or (num_tokens,) for text-only
+            hidden_states: Input hidden states, shape (num_tokens, hidden_size)
+
+        Returns:
+            Attention output tensor, shape (num_tokens, hidden_size)
+        """
+        fused_qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = fused_qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        # The norms act per head, so reshape to heads and flatten back.
+        token_count = q.shape[0]
+        q_heads = q.view(token_count, self.num_heads, self.head_dim)
+        k_heads = k.view(token_count, self.num_kv_heads, self.head_dim)
+        q = self.q_norm(q_heads).view(token_count, self.q_size)
+        k = self.k_norm(k_heads).view(token_count, self.kv_size)
+
+        q, k = self.rotary_emb(positions, q, k)
+
+        context = self.attn(q, k, v)
+
+        projected, _ = self.dense(context)
+        return projected
+
+
+class BailingMoeV2DecoderLayer(nn.Module):
+    """One decoder layer: attention followed by a dense MLP or a sparse MoE block."""
+
+    def __init__(
+        self,
+        config: BailingMoeV2Config,
+        layer_idx: int,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.layer_idx = layer_idx
+
+        self.attention = BailingMoeV2Attention(
+            config=config,
+            layer_idx=layer_idx,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attention",
+        )
+
+        # Layers before `first_k_dense_replace` (or every layer when no
+        # experts are configured) use a dense MLP; the others use MoE.
+        use_moe = config.num_experts is not None and layer_idx >= config.first_k_dense_replace
+        mlp_prefix = f"{prefix}.mlp"
+        if use_moe:
+            self.mlp = BailingMoeV2SparseMoeBlock(
+                config=config,
+                quant_config=quant_config,
+                prefix=mlp_prefix,
+            )
+            self.is_moe = True
+        else:
+            self.mlp = BailingMoeV2MLP(
+                config=config,
+                intermediate_size=config.intermediate_size,
+                quant_config=quant_config,
+                prefix=mlp_prefix,
+            )
+            self.is_moe = False
+
+        # vLLM RMSNorm replaces BailingMoeV2RMSNorm; small differences from
+        # the reference implementation might exist.
+        norm_eps = config.rms_norm_eps
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+        image_mask: torch.Tensor | None = None,
+        audio_mask: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Apply the layer to ``hidden_states``.
+
+        Args:
+            positions: Position IDs
+            hidden_states: Input hidden states
+            residual: Residual from the previous layer, or None for the
+                first layer (the input itself then becomes the residual)
+            image_mask: Mask for image tokens (for MultiRouter MoE)
+            audio_mask: Mask for audio tokens (for MultiRouter MoE)
+
+        Returns:
+            Tuple of (hidden_states, residual)
+        """
+        if residual is not None:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        else:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+
+        attn_out = self.attention(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        normed, residual = self.post_attention_layernorm(attn_out, residual)
+
+        if not self.is_moe:
+            # Dense MLP only takes hidden_states (no routing masks)
+            return self.mlp(normed), residual
+
+        return self.mlp(normed, image_mask, audio_mask), residual
+
+
+@support_torch_compile(
+    dynamic_arg_dims={
+        "input_ids": 0,
+        "positions": -1,  # last dim, not dim 0; presumably for a leading M-RoPE axis — confirm
+        "intermediate_tensors": 0,
+        "inputs_embeds": 0,
+        "image_mask": 0,
+        "audio_mask": 0,
+    }
+)
+class BailingMoeV2Model(nn.Module):
+    """BailingMoeV2 decoder stack (token embedding, decoder layers, final norm), adapted from:
+
+    Ming repo BailingMoeV2Model
+    https://github.com/inclusionAI/Ming/blob/2a0c02ae3130190160c215f89fce7de3005db483/modeling_bailing_moe_v2.py
+    vLLM repo BailingMoeModel
+    https://github.com/vllm-project/vllm/blob/7291d1b288558d48508e1a17c37b0aa170332264/vllm/model_executor/models/bailing_moe.py
+    """
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        # hf_text_config is expected to be a BailingMoeV2Config
+        config = vllm_config.model_config.hf_text_config
+
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        self.config = config
+        self.quant_config = quant_config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.tie_word_embeddings = getattr(config, "tie_word_embeddings", False)
+
+        if get_pp_group().is_first_rank or (self.tie_word_embeddings and get_pp_group().is_last_rank):
+            self.word_embeddings = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.word_embeddings",
+            )
+        else:
+            self.word_embeddings = PPMissingLayer()  # placeholder on ranks that hold no embedding table
+
+        # Decoder layers; forward() only runs layers[start_layer:end_layer] (presumably this PP rank's share)
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: BailingMoeV2DecoderLayer(
+                config=config,
+                layer_idx=int(prefix.split(".")[-1]),  # layer index is the last dotted component of the prefix
+                cache_config=cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        if get_pp_group().is_last_rank:
+            # Uses vLLM's RMSNorm in place of Ming's BailingMoeV2RMSNorm; numerics may differ slightly
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()  # the final norm is only applied on the last PP rank
+
+        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
+            ["hidden_states", "residual"], config.hidden_size
+        )
+
+    def get_input_embeddings(self):
+        return self.word_embeddings
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        image_mask: torch.Tensor | None = None,
+        audio_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds  # caller-supplied embeddings take precedence over input_ids
+            else:
+                hidden_states = self.word_embeddings(input_ids)
+            residual = None  # the decoder layer's forward turns None into residual = hidden_states
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        for layer in self.layers[self.start_layer : self.end_layer]:
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                residual,
+                image_mask=image_mask,
+                audio_mask=audio_mask,
+            )
+
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({"hidden_states": hidden_states, "residual": residual})
+
+        hidden_states, _ = self.norm(hidden_states, residual)  # norm returns a pair here; the residual is dropped
+        return hidden_states
+
+
+class BailingMoeV2ForCausalLM(nn.Module, CustomProcessMixin):
+    """BailingMoeV2Model plus LM head, logits processor and sampler, adapted for vLLM.
+
+    Inherits from CustomProcessMixin to support custom preprocessing and postprocessing
+    for integration with omni model pipelines.
+    """
+
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        # hf_text_config is expected to be a BailingMoeV2Config
+        config = vllm_config.model_config.hf_text_config
+        quant_config = vllm_config.quant_config
+
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = BailingMoeV2Model(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
+        )
+
+        self.tie_word_embeddings = getattr(config, "tie_word_embeddings", False)
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+            if self.tie_word_embeddings:
+                self.lm_head.weight = self.model.word_embeddings.weight  # share the embedding tensor
+        else:
+            self.lm_head = PPMissingLayer()  # only the last PP rank owns an LM head
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = Sampler()
+        self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        image_mask: torch.Tensor | None = None,
+        audio_mask: torch.Tensor | None = None,
+    ):
+        hidden_states = self.model(  # pure delegation; logits are computed separately in compute_logits
+            input_ids=input_ids,
+            positions=positions,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+            image_mask=image_mask,
+            audio_mask=audio_mask,
+        )
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata,
+    ) -> torch.Tensor | None:
+        logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata,
+    ) -> SamplerOutput | None:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, weight_name, shard_id)
+            # BailingMoE stores fused QKV in checkpoint as query_key_value (loaded whole: shard_id None)
+            ("qkv_proj", "query_key_value", None),
+            # Dense MLP and shared_experts gate/up are stored separately; shard 0 = gate, shard 1 = up
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Gate router linear layers: checkpoint `{r}.weight` -> model `{r}.gate.weight`
+        gate_name_mapper = WeightsMapper(
+            orig_to_new_substr={f".{r}.weight": f".{r}.gate.weight" for r in ("gate", "image_gate", "audio_gate")}
+        )
+
+        # FusedMoE expert params mapping is identical across all MoE layers, so use the first one found
+        expert_params_mapping: list[tuple[str, str, int, str]] = []
+        for layer in self.model.layers:
+            if hasattr(layer, "mlp") and hasattr(layer.mlp, "experts"):
+                expert_params_mapping = layer.mlp.experts.expert_mapping
+                break
+
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in gate_name_mapper.apply(weights):
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name or "mlp.experts" in name:  # routed experts are handled further down
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict.get(name)
+                if param is not None:
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, shard_id)
+                    loaded_params.add(name)
+                break  # first matching stacked entry wins; a name absent from params_dict is skipped silently
+            else:  # no stacked entry matched: try the per-expert mapping next
+                for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict.get(name)
+                    if param is not None:
+                        weight_loader = param.weight_loader
+                        weight_loader(param, loaded_weight, name, shard_id=shard_id, expert_id=expert_id)
+                        loaded_params.add(name)
+                    break  # first matching expert entry wins; a name absent from params_dict is skipped
+                else:  # neither mapping matched: load as a plain parameter under its (gate-remapped) name
+                    param = params_dict.get(name)
+                    if param is not None:
+                        weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                        weight_loader(param, loaded_weight)
+                        loaded_params.add(name)
+
+        return loaded_params
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/projectors.py b/vllm_omni/model_executor/models/ming_flash_omni/projectors.py
new file mode 100644
index 00000000000..42e53d1c635
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_flash_omni/projectors.py
@@ -0,0 +1,184 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 The vLLM-Omni team.
+# Copyright (c) Ant Group. All rights reserved.
+# Adapted from Ming repository modeling_bailingmm2.py
+# https://github.com/inclusionAI/Ming
+
+from collections.abc import Iterable
+
+import torch
+import torch.nn as nn
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+logger = init_logger(__name__)
+
+
+class Transpose(nn.Module):
+    """Module form of ``torch.transpose`` so a dim swap can sit inside an nn.Sequential."""
+
+    def __init__(self, dim0: int, dim1: int):
+        super().__init__()
+        # The two dimensions exchanged by forward().
+        self.dim0, self.dim1 = dim0, dim1
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.transpose(x, self.dim0, self.dim1)
+
+
+class VisionProjector(nn.Module):
+    """Maps vision-encoder features into the LLM hidden size with a Linear/GELU stack.
+
+    Args:
+        vision_dim: Feature size produced by the vision encoder (out_hidden_size).
+        llm_dim: Hidden size of the LLM.
+        mlp_depth: How many Linear layers to stack; values below 1 behave like 1.
+    """
+
+    def __init__(self, vision_dim: int, llm_dim: int, mlp_depth: int = 1):
+        super().__init__()
+        # Input projection first, then (mlp_depth - 1) GELU -> Linear pairs.
+        stack: list[nn.Module] = [nn.Linear(vision_dim, llm_dim)]
+        for _ in range(mlp_depth - 1):
+            stack.extend((nn.GELU(), nn.Linear(llm_dim, llm_dim)))
+        self.proj = nn.Sequential(*stack)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the projection stack to the last dimension of ``x``.
+
+        Args:
+            x: [seq_len, vision_dim] or [B, seq_len, vision_dim]
+
+        Returns:
+            Tensor shaped like ``x`` except that the last dim is llm_dim.
+        """
+        return self.proj(x)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        known = dict(self.named_parameters())
+        loaded: set[str] = set()
+        for raw_name, tensor in weights:
+            # Names that lack the "proj." prefix get it prepended before lookup.
+            key = raw_name if raw_name.startswith("proj.") else f"proj.{raw_name}"
+            target = known.get(key)
+            if target is None:
+                logger.warning("Skipping unknown vision projector weight: %s", key)
+                continue
+            loader = getattr(target, "weight_loader", default_weight_loader)
+            loader(target, tensor)
+            loaded.add(key)
+        return loaded
+
+
+class AudioProjector(nn.Module):
+    """Projector for audio features: strided Conv1d downsampling, then optional GELU/Linear pairs.
+
+    Args:
+        audio_dim: Audio encoder output dimension (n_state).
+        llm_dim: LLM hidden dimension.
+        ds_kernel_size: Conv1d kernel size for downsampling.
+        ds_stride: Conv1d stride for downsampling.
+        mlp_depth: Total number of projection layers (>= 1).
+    """
+
+    def __init__(
+        self,
+        audio_dim: int,
+        llm_dim: int,
+        ds_kernel_size: int = 3,
+        ds_stride: int = 2,
+        mlp_depth: int = 1,
+    ):
+        super().__init__()
+        self.ds_kernel_size = ds_kernel_size
+        self.ds_stride = ds_stride
+
+        layers: list[nn.Module] = [
+            nn.Conv1d(
+                audio_dim,
+                llm_dim,
+                kernel_size=ds_kernel_size,
+                stride=ds_stride,
+                padding=ds_kernel_size // 2,
+            ),
+            Transpose(-1, -2),  # [B, llm_dim, T'] -> [B, T', llm_dim]
+        ]
+        for _ in range(1, mlp_depth):
+            layers.append(nn.GELU())
+            layers.append(nn.Linear(llm_dim, llm_dim))
+        self.proj = nn.Sequential(*layers)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Project audio features with temporal downsampling.
+
+        Args:
+            x: [B, T, audio_dim] audio encoder output (channel-last).
+
+        Returns:
+            [B, T', llm_dim] projected features (channel-last),
+            where T' = (T - ds_kernel_size + 2*(ds_kernel_size//2)) // ds_stride + 1.
+        """
+        # Conv1d expects [B, C, T], so transpose input
+        x = x.transpose(-1, -2)  # [B, audio_dim, T]
+        return self.proj(x)
+
+    def forward_packed(
+        self,
+        packed: torch.Tensor,
+        encoded_lens: list[int],
+    ) -> tuple[torch.Tensor, list[int]]:
+        """Project packed audio features from the Whisper encoder.
+
+        Args:
+            packed: [total_T', audio_dim] packed encoder output.
+            encoded_lens: Per-clip lengths; must sum to packed.shape[0].
+
+        Returns:
+            Tuple of ([total_T'', llm_dim] packed projected features,
+            list of projected lengths per clip). With no clips the result is
+            an empty [0, llm_dim] tensor and an empty list.
+        """
+        conv1d = self.proj[0]
+        mlp = self.proj[2:]  # index 1 is the Transpose; segments below are already channel-last
+        segments = packed.split(encoded_lens)  # also checks that the lengths sum to packed.shape[0]
+        if not segments:
+            # torch.cat rejects an empty list, so short-circuit the no-clip case.
+            return packed.new_zeros((0, conv1d.out_channels)), []
+
+        # Convolve each clip on its own so the kernel never mixes frames across clips.
+        conv_segments = []
+        for seg in segments:
+            out = conv1d(seg.transpose(0, 1).unsqueeze(0))  # [1, llm_dim, T'_i]
+            conv_segments.append(out.squeeze(0).transpose(0, 1))  # [T'_i, llm_dim]
+        proj_lens = [seg.shape[0] for seg in conv_segments]
+        packed_proj = torch.cat(conv_segments, dim=0)  # [total_T'', llm_dim]
+        packed_proj = mlp(packed_proj)
+        return packed_proj, proj_lens
+
+    def compute_output_length(self, input_length: torch.Tensor) -> torch.Tensor:
+        """Compute output sequence length after Conv1d downsampling.
+
+        Args:
+            input_length: Original mel spectrogram lengths.
+
+        Returns:
+            Output lengths after both convolutions.
+        """
+        length = (input_length - 3 + 2 * 1) // 2 + 1  # fixed k=3, p=1, s=2 stage (presumably Whisper's conv)
+        length = (length - self.ds_kernel_size + 2 * (self.ds_kernel_size // 2)) // self.ds_stride + 1
+        return length
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if not name.startswith("proj."):
+                name = f"proj.{name}"
+            if name not in params_dict:
+                logger.warning("Skipping unknown audio projector weight: %s", name)
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
diff --git a/vllm_omni/model_executor/models/ming_flash_omni/vision_encoder.py b/vllm_omni/model_executor/models/ming_flash_omni/vision_encoder.py
new file mode 100644
index 00000000000..7976d76ce8d
--- /dev/null
+++ b/vllm_omni/model_executor/models/ming_flash_omni/vision_encoder.py
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 The vLLM-Omni team.
+# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from Ming repository qwen3_moe_vit.py
+# https://github.com/inclusionAI/Ming
+
+from collections.abc import Iterable
+
+import torch
+import torch.nn as nn
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.models.qwen3_omni_moe_thinker import (
+    Qwen3Omni_VisionTransformer,
+)
+from vllm.model_executor.models.utils import WeightsMapper
+
+logger = init_logger(__name__)
+
+
+def _adapt_vision_config(vision_config):
+    """Fill in, in place, the attributes Qwen3Omni_VisionTransformer needs but Ming's config may lack."""
+    if getattr(vision_config, "image_size", None) is None:
+        import math
+
+        num_pos = getattr(vision_config, "num_position_embeddings", None)
+        if num_pos:
+            # Position table is assumed square: side length (in patches) times patch size.
+            side = int(math.sqrt(num_pos))
+        else:
+            side = 14  # fallback grid size
+        vision_config.image_size = side * vision_config.patch_size
+
+    if not hasattr(vision_config, "apply_vit_abs_pos_embed"):
+        vision_config.apply_vit_abs_pos_embed = True
+    return vision_config
+
+
+class MingVisionEncoder(nn.Module):
+    """Thin adapter exposing vLLM's Qwen3Omni_VisionTransformer under Ming's checkpoint naming."""
+
+    # Substring renames applied to checkpoint keys before they reach the wrapped transformer.
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_substr={
+            "deepstack_merger_list.": "merger_list.",
+            "merger.norm.": "merger.ln_q.",
+            "merger.linear_fc1.": "merger.mlp.0.",
+            "merger.linear_fc2.": "merger.mlp.2.",
+        }
+    )
+
+    def __init__(
+        self,
+        vision_config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.encoder = Qwen3Omni_VisionTransformer(
+            vision_config=_adapt_vision_config(vision_config),
+            norm_eps=1e-6,
+            quant_config=quant_config,
+            prefix=f"{prefix}.encoder",
+        )
+        self.image_emb_dim = vision_config.out_hidden_size
+        # Deepstack counts as enabled whenever the config carries a non-None index list.
+        self.use_deepstack = getattr(vision_config, "deepstack_visual_indexes", None) is not None
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.encoder.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.encoder.device
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        """Run the wrapped vision transformer.
+
+        Args:
+            pixel_values: Flattened pixel values.
+            grid_thw: [num_images, 3] tensor of (t, h, w) grid sizes.
+
+        Returns:
+            With deepstack enabled, the multi-scale features concatenated on the
+            feature dim: [seq_len, hidden_size * (1 + num_deepstack)];
+            otherwise [seq_len, hidden_size].
+        """
+        return self.encoder(pixel_values, grid_thw=grid_thw)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        import re
+
+        # Per-index entries ("merger_list.<idx>.<sub>.") are not matched by the fixed
+        # substrings of hf_to_vllm_mapper, so they are renamed with regexes afterwards.
+        merger_renames = (
+            (r"(merger_list\.\d+)\.norm\.", r"\1.ln_q."),
+            (r"(merger_list\.\d+)\.linear_fc1\.", r"\1.mlp.0."),
+            (r"(merger_list\.\d+)\.linear_fc2\.", r"\1.mlp.2."),
+        )
+
+        def _renamed():
+            for key, tensor in self.hf_to_vllm_mapper.apply(weights):
+                for pattern, repl in merger_renames:
+                    key = re.sub(pattern, repl, key)
+                yield key, tensor
+
+        # Report names relative to this wrapper, i.e. with the "encoder." attribute prefix.
+        return {f"encoder.{loaded}" for loaded in self.encoder.load_weights(_renamed())}
diff --git a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py
index 92cecbff107..f7e664c74d6 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py
@@ -23,10 +23,11 @@
 import torchaudio.compliance.kaldi as kaldi
 from torch import Tensor
 
+from vllm_omni.model_executor.models.whisper_utils import Conv1d, ConvTranspose1d
 from vllm_omni.utils.audio import mel_filter_bank, peak_normalize
 
 from .core_vq import DistributedGroupResidualVectorQuantization
-from .whisper_encoder import Conv1d, ConvTranspose1d, WhisperEncoder
+from .whisper_encoder import WhisperEncoder
 
 
 def dynamic_range_compression_torch(x, c=1, clip_val=1e-5):
diff --git a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py
index 8464f53c9df..7756720b2ba 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py
@@ -23,6 +23,7 @@
 from torch import Tensor, nn
 
 from vllm_omni.diffusion.attention.backends.utils.fa import HAS_FLASH_ATTN, flash_attn_varlen_func
+from vllm_omni.model_executor.models.whisper_utils import Conv1d, Linear, sinusoids
 from vllm_omni.utils.audio import mel_filter_bank
 
 N_FFT = 400
@@ -102,30 +103,6 @@ def get_mel_audio(audio, padding=False, audio_vq_ds_rate=1, n_mels=128):
     return mel
 
 
-def sinusoids(length, channels, max_timescale=10000):
-    """Returns sinusoids for positional embedding"""
-    assert channels % 2 == 0
-    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
-    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
-    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
-    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
-
-
-class Conv1d(nn.Conv1d):
-    def _conv_forward(self, x: Tensor, weight: Tensor, bias: Tensor | None) -> Tensor:
-        return super()._conv_forward(x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype))
-
-
-class ConvTranspose1d(nn.ConvTranspose1d):
-    def _conv_forward(self, x: Tensor, weight: Tensor, bias: Tensor | None) -> Tensor:
-        return super()._conv_forward(x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype))
-
-
-class Linear(nn.Linear):
-    def forward(self, x: Tensor) -> Tensor:
-        return F.linear(x, self.weight.to(x.dtype), None if self.bias is None else self.bias.to(x.dtype))
-
-
 class MultiHeadAttention(nn.Module):
     def __init__(self, n_state: int, n_head: int, use_flash_attention: bool = True):
         super().__init__()
diff --git a/vllm_omni/model_executor/models/registry.py b/vllm_omni/model_executor/models/registry.py
index 3407b428695..5a466dbd62e 100644
--- a/vllm_omni/model_executor/models/registry.py
+++ b/vllm_omni/model_executor/models/registry.py
@@ -174,6 +174,23 @@
         "dynin_omni",
         "DyninOmniForConditionalGeneration",
     ),
+    ## Ming-flash-omni-2.0
+    "MingFlashOmniForConditionalGeneration": (
+        "ming_flash_omni",
+        "ming_flash_omni",
+        "MingFlashOmniForConditionalGeneration",
+    ),
+    "MingFlashOmniThinkerForConditionalGeneration": (
+        "ming_flash_omni",
+        "ming_flash_omni_thinker",
+        "MingFlashOmniThinkerForConditionalGeneration",
+    ),
+    # Alias: HF repo currently ships this architecture name in config.json
+    "BailingMM2NativeForConditionalGeneration": (
+        "ming_flash_omni",
+        "ming_flash_omni",
+        "MingFlashOmniForConditionalGeneration",
+    ),
 }
 
 
diff --git a/vllm_omni/model_executor/models/whisper_utils.py b/vllm_omni/model_executor/models/whisper_utils.py
new file mode 100644
index 00000000000..5aa2fc8a3ad
--- /dev/null
+++ b/vllm_omni/model_executor/models/whisper_utils.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 The vLLM-Omni team.
+# Copyright (c) 2022 OpenAI
+#
+# Shared Whisper encoder primitives used by multiple model implementations.
+# Originally from the OpenAI Whisper codebase.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def sinusoids(length, channels, max_timescale=10000):
+    """Build a [length, channels] table: sines in the first half of dim 1, cosines in the second."""
+    assert channels % 2 == 0
+    half = channels // 2
+    step = np.log(max_timescale) / (half - 1)  # log-spacing between consecutive frequencies
+    angles = torch.arange(length)[:, None] * torch.exp(-step * torch.arange(half))[None, :]
+    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
+
+
+class Conv1d(nn.Conv1d):
+    """nn.Conv1d that casts its weight and bias to the input's dtype on every call."""
+
+    def _conv_forward(self, x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None) -> torch.Tensor:
+        return super()._conv_forward(x, weight.to(x.dtype), bias.to(x.dtype) if bias is not None else None)
+
+
+class ConvTranspose1d(nn.ConvTranspose1d):  # NOTE(review): nn.ConvTranspose1d.forward may never call this hook — confirm
+    def _conv_forward(self, x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None) -> torch.Tensor:
+        return super()._conv_forward(x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype))
+
+
+class Linear(nn.Linear):
+    """nn.Linear that casts its weight and bias to the input's dtype on every call."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return F.linear(x, self.weight.to(x.dtype), self.bias.to(x.dtype) if self.bias is not None else None)
diff --git a/vllm_omni/model_executor/stage_configs/bailingmm_moe_v2_lite.yaml b/vllm_omni/model_executor/stage_configs/bailingmm_moe_v2_lite.yaml
new file mode 100644
index 00000000000..b7d0aeeb742
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/bailingmm_moe_v2_lite.yaml
@@ -0,0 +1,46 @@
+# Stage config for Ming-flash-omni-2.0
+# Stage 0: Thinker (Multimodal understanding + text generation)
+# Stage 1a (planned, not yet implemented): Image Generator (Text embeddings -> PIL image)
+# Stage 1b (planned, not yet implemented): Talker (Text embeddings -> audio waveform)
+
+async_chunk: false
+stage_args:
+  - stage_id: 0
+    stage_type: llm
+    runtime:
+      devices: "0,1,2,3"
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: MingFlashOmniForConditionalGeneration
+      # tokenizer_subdir: talker/llm
+      worker_type: ar
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.9
+      enforce_eager: false
+      trust_remote_code: true
+      engine_output_type: latent
+      distributed_executor_backend: "mp"
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      tensor_parallel_size: 4  # Use 4 GPUs for MoE model
+      # pipeline_parallel_size: 4
+      hf_config_name: llm_config
+      compilation_config:
+        pass_config:
+          # there's a version mismatch regarding vllm and flashinfer
+          # disable fuse allreduce for now
+          fuse_allreduce_rms: false
+    final_output: true  # Can output text directly
+    final_output_type: text
+    is_comprehension: true
+    default_sampling_params:
+      temperature: 0.4
+      top_p: 0.9
+      max_tokens: 2048
+      repetition_penalty: 1.05
+      seed: 42
+      detokenize: true
+
+  # Future Stage 1a: Image Generator (Optional - not yet implemented)
+  # Future Stage 1b: Talker/TTS (Optional - not yet implemented)
diff --git a/vllm_omni/transformers_utils/configs/__init__.py b/vllm_omni/transformers_utils/configs/__init__.py
index 0aa3624f802..598ac3a9655 100644
--- a/vllm_omni/transformers_utils/configs/__init__.py
+++ b/vllm_omni/transformers_utils/configs/__init__.py
@@ -19,6 +19,11 @@
     "FishSpeechFastARConfig": "vllm_omni.transformers_utils.configs.fish_speech",
     "VoxCPMConfig": "vllm_omni.transformers_utils.configs.voxcpm",
     "VoxCPM2Config": "vllm_omni.transformers_utils.configs.voxcpm2",
+    "BailingMoeV2Config": "vllm_omni.transformers_utils.configs.ming_flash_omni",
+    "BailingMM2Config": "vllm_omni.transformers_utils.configs.ming_flash_omni",
+    "MingFlashOmniConfig": "vllm_omni.transformers_utils.configs.ming_flash_omni",
+    "Qwen3VLMoeVisionConfig": "vllm_omni.transformers_utils.configs.ming_flash_omni",
+    "WhisperEncoderConfig": "vllm_omni.transformers_utils.configs.ming_flash_omni",
 }
 
 __all__ = [
@@ -31,6 +36,11 @@
     "FishSpeechFastARConfig",
     "VoxCPMConfig",
     "VoxCPM2Config",
+    "BailingMoeV2Config",
+    "BailingMM2Config",
+    "MingFlashOmniConfig",
+    "Qwen3VLMoeVisionConfig",
+    "WhisperEncoderConfig",
 ]
 
 
@@ -51,5 +61,6 @@ def __dir__():
 # run as soon as `vllm_omni.transformers_utils.configs` is imported.
 from vllm_omni.transformers_utils.configs import fish_speech as _fish_speech  # noqa: F401, E402
 from vllm_omni.transformers_utils.configs import mammoth_moda2 as _mammoth_moda2  # noqa: F401, E402
+from vllm_omni.transformers_utils.configs import ming_flash_omni as _ming_flash_omni  # noqa: F401, E402
 from vllm_omni.transformers_utils.configs import voxcpm as _voxcpm  # noqa: F401, E402
 from vllm_omni.transformers_utils.configs import voxcpm2 as _voxcpm2  # noqa: F401, E402
diff --git a/vllm_omni/transformers_utils/configs/ming_flash_omni.py b/vllm_omni/transformers_utils/configs/ming_flash_omni.py
new file mode 100644
index 00000000000..dd13b682dee
--- /dev/null
+++ b/vllm_omni/transformers_utils/configs/ming_flash_omni.py
@@ -0,0 +1,302 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 The vLLM-Omni team.
+# Copyright 2024 ANT Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Configuration for Ming-flash-omni-2.0 model"""
+
+import os
+from typing import Any, ClassVar
+
+from transformers import AutoConfig, AutoTokenizer, PretrainedConfig, PreTrainedTokenizerFast
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class BailingMoeV2Config(PretrainedConfig):
+    model_type = "bailing_moe_v2"  # key under which this class is registered with AutoConfig below
+
+    def __init__(
+        self,
+        vocab_size=30592,
+        hidden_size=1024,
+        intermediate_size=None,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_key_value_heads=0,
+        hidden_act="silu",
+        use_qkv_bias=False,
+        use_qk_norm=False,
+        use_bias=True,
+        rms_norm_eps=1e-05,
+        norm_head=False,
+        tie_word_embeddings=False,
+        embedding_dropout=0.0,
+        attention_dropout=0.0,
+        output_dropout=0.0,
+        initializer_range=0.02,
+        max_position_embeddings=16384,
+        rope_theta=10000.0,
+        use_cache=True,
+        use_sliding_window=False,
+        sliding_window=81920,
+        max_window_layers=28,
+        rope_scaling=None,
+        mrope_section=None,
+        pad_token_id=126081,
+        num_experts=16,
+        num_shared_experts=1,
+        num_experts_per_tok=2,
+        n_group=8,
+        topk_group=4,
+        routed_scaling_factor=2.5,
+        moe_intermediate_size=None,
+        first_k_dense_replace=0,
+        head_dim=None,
+        output_router_logits=False,
+        partial_rotary_factor=0.5,
+        router_type="topN",
+        _attn_implementation="flash_attention_2",
+        use_interleaved_frame_timestamp=True,
+        # Multimodal token IDs (patch / start / end marker per modality)
+        image_patch_token=157157,
+        video_patch_token=157175,
+        audio_patch_token=157168,
+        image_start_token=157158,
+        video_start_token=157160,
+        audio_start_token=157169,
+        image_end_token=157159,
+        video_end_token=157161,
+        audio_end_token=157170,
+        # Position encoding parameters
+        spatial_merge_size=2,
+        tokens_per_second=2,
+        **kwargs,
+    ):
+        self.num_hidden_layers = num_hidden_layers
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.use_qkv_bias = use_qkv_bias
+        self.use_bias = use_bias
+        self.norm_head = norm_head
+        self.rms_norm_eps = rms_norm_eps
+        self.embedding_dropout = embedding_dropout
+        self.attention_dropout = attention_dropout
+        self.output_dropout = output_dropout
+        self.initializer_range = initializer_range
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.use_cache = use_cache
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+        self.head_dim = head_dim or self.hidden_size // self.num_attention_heads  # falsy -> derived size
+        self.use_qk_norm = use_qk_norm  # arg unused; QK norm is always applied
+
+        # Default `mrope_section`: take it from `rope_scaling` when present, else [8, 12, 12].
+        # The fallback is meant to match `apply_3d_rotary_pos_emb` in Ming's repo:
+        # https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/modeling_bailing_moe_v2.py
+        if mrope_section is None:
+            mrope_section = (rope_scaling or {}).get("mrope_section", [8, 12, 12])
+        # Work on a copy so the caller's dict is untouched; add mrope_section only if it is missing.
+        if rope_scaling is not None and isinstance(rope_scaling, dict):
+            rope_scaling = dict(rope_scaling)
+            rope_scaling.setdefault("mrope_section", mrope_section)
+        self.rope_scaling = rope_scaling
+
+        # NOTE: Expose rope_parameters["mrope_section"] only for the rope types listed below, else None.
+        # This refers to the pattern used for GLM-Image in vllm_omni/patch.py
+        rope_type = (rope_scaling or {}).get("type", (rope_scaling or {}).get("rope_type", ""))
+        if rope_type in ("video_rope", "3D", "mrope"):
+            self.rope_parameters = {"mrope_section": mrope_section}
+        else:
+            self.rope_parameters = None
+
+        # MoE, router and attention-backend settings (stored exactly as given)
+        self.num_experts = num_experts
+        self.num_shared_experts = num_shared_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.moe_intermediate_size = moe_intermediate_size
+        self.first_k_dense_replace = first_k_dense_replace
+        self.output_router_logits = output_router_logits
+        self.routed_scaling_factor = routed_scaling_factor
+        self.partial_rotary_factor = partial_rotary_factor
+        self.router_type = router_type
+        self.use_interleaved_frame_timestamp = use_interleaved_frame_timestamp
+        self._attn_implementation = _attn_implementation
+
+        # Multimodal token IDs and position encoding
+        self.image_patch_token = image_patch_token
+        self.video_patch_token = video_patch_token
+        self.audio_patch_token = audio_patch_token
+        self.image_start_token = image_start_token
+        self.video_start_token = video_start_token
+        self.audio_start_token = audio_start_token
+        self.image_end_token = image_end_token
+        self.video_end_token = video_end_token
+        self.audio_end_token = audio_end_token
+        self.spatial_merge_size = spatial_merge_size
+        self.tokens_per_second = tokens_per_second
+
+        super().__init__(pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Qwen3VLMoeVisionConfig(PretrainedConfig):
+    """Configuration class for Qwen3 MoE Vision Transformer (loadable from a nested `vision_config`)"""
+
+    model_type = "qwen3_moe_vit"
+
+    def __init__(
+        self,
+        depth=27,
+        hidden_size=1152,
+        hidden_act="gelu_pytorch_tanh",
+        intermediate_size=4304,
+        num_heads=16,
+        in_channels=3,
+        patch_size=16,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=3584,
+        num_position_embeddings=2304,
+        deepstack_visual_indexes=None,  # None -> [8, 16, 24]; avoids one list shared by all instances
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.num_position_embeddings = num_position_embeddings
+        self.initializer_range = initializer_range
+        self.deepstack_visual_indexes = [8, 16, 24] if deepstack_visual_indexes is None else deepstack_visual_indexes
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str | os.PathLike, **kwargs) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        # A composite checkpoint nests the vision settings; unwrap them before building this config.
+        if "vision_config" in config_dict:
+            config_dict = config_dict["vision_config"]
+
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class WhisperEncoderConfig(PretrainedConfig):
+    """Configuration for the Whisper audio encoder: a raw settings dict plus three extra fields."""
+
+    model_type = "whisper_encoder"
+
+    def __init__(
+        self,
+        whisper_encoder_config: dict[str, Any] | None = None,
+        ds_kernel_size=3,  # presumably the downsampling conv kernel size — TODO confirm against the projector
+        ds_stride=2,  # presumably the downsampling conv stride — TODO confirm against the projector
+        norm_query_embeds=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.whisper_encoder_config = whisper_encoder_config or {}  # None or empty input -> {}
+        self.ds_kernel_size = ds_kernel_size
+        self.ds_stride = ds_stride
+        self.norm_query_embeds = norm_query_embeds
+
+
+class BailingMM2Config(PretrainedConfig):
+    model_type = "bailingmm_moe_v2_lite"
+    is_composition = True
+    sub_configs: ClassVar = {"llm_config": AutoConfig}  # only the LLM sub-config is declared here
+
+    def __init__(
+        self,
+        mlp_depth=1,
+        llm_config: BailingMoeV2Config | None = None,
+        vision_config: Qwen3VLMoeVisionConfig | None = None,
+        audio_config: WhisperEncoderConfig | None = None,
+        **kwargs,
+    ):
+        self.audio_config = WhisperEncoderConfig(**audio_config) if isinstance(audio_config, dict) else audio_config
+        self.vision_config = (
+            Qwen3VLMoeVisionConfig(**vision_config) if isinstance(vision_config, dict) else vision_config
+        )
+        self.llm_config = BailingMoeV2Config(**llm_config) if isinstance(llm_config, dict) else llm_config
+        self.mlp_depth = mlp_depth  # dict inputs above are promoted to config objects; others kept as given
+        super().__init__(**kwargs)
+
+    def get_text_config(self, decoder: bool = False) -> PretrainedConfig:  # noqa: ARG002
+        return self.llm_config  # `decoder` is ignored; this is None when no llm_config was supplied
+
+
+class MingFlashOmniConfig(PretrainedConfig):
+    """Top-level config for the unified Ming-flash-omni-2.0 model; wraps the thinker sub-config."""
+
+    model_type = "ming_flash_omni"
+    is_composition = True
+    sub_configs: ClassVar = {"thinker_config": BailingMM2Config}
+
+    def __init__(
+        self,
+        thinker_config: BailingMM2Config | None = None,
+        image_gen_config: dict[str, Any] | None = None,
+        talker_config: dict[str, Any] | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if isinstance(thinker_config, dict):
+            self.thinker_config = BailingMM2Config(**thinker_config)
+        else:
+            self.thinker_config = thinker_config or BailingMM2Config()  # None -> default thinker config
+
+        # Image generation config: stored verbatim, not interpreted here (future implementation)
+        self.image_gen_config = image_gen_config
+
+        # Talker config: stored verbatim, not interpreted here (future implementation)
+        self.talker_config = talker_config
+
+    def get_text_config(self, decoder: bool = False) -> PretrainedConfig:  # noqa: ARG002
+        return self.thinker_config.get_text_config()  # delegates to the thinker; `decoder` is ignored
+
+
+# Register model_type -> config class for AutoConfig (the vision/audio sub-configs are not registered)
+AutoConfig.register(BailingMoeV2Config.model_type, BailingMoeV2Config)
+AutoConfig.register(BailingMM2Config.model_type, BailingMM2Config)
+AutoConfig.register(MingFlashOmniConfig.model_type, MingFlashOmniConfig)
+
+# Map both composition configs to PreTrainedTokenizerFast so that
+# AutoTokenizer.from_pretrained can resolve the tokenizer class
+AutoTokenizer.register(BailingMM2Config, fast_tokenizer_class=PreTrainedTokenizerFast)
+AutoTokenizer.register(MingFlashOmniConfig, fast_tokenizer_class=PreTrainedTokenizerFast)
diff --git a/vllm_omni/transformers_utils/processors/__init__.py b/vllm_omni/transformers_utils/processors/__init__.py
new file mode 100644
index 00000000000..52ca6575397
--- /dev/null
+++ b/vllm_omni/transformers_utils/processors/__init__.py
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 The vLLM-Omni team.
+
+from vllm_omni.transformers_utils.processors.ming import (
+    MingFlashOmniProcessor,
+    MingWhisperFeatureExtractor,
+)
+
+__all__ = [
+    "MingFlashOmniProcessor",
+    "MingWhisperFeatureExtractor",
+]
diff --git a/vllm_omni/transformers_utils/processors/ming.py b/vllm_omni/transformers_utils/processors/ming.py
new file mode 100644
index 00000000000..7f414b7268c
--- /dev/null
+++ b/vllm_omni/transformers_utils/processors/ming.py
@@ -0,0 +1,430 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 The vLLM-Omni team.
+# Copyright 2024 ANT Group and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoFeatureExtractor, AutoProcessor
+from transformers.feature_extraction_utils import BatchFeature, FeatureExtractionMixin
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+
+DEFAULT_IMAGE_PATCH_TOKEN = "<imagePatch>"  # repeated once per merged image patch
+DEFAULT_IM_START_TOKEN = "<image>"
+DEFAULT_IM_END_TOKEN = "</image>"
+DEFAULT_VID_START_TOKEN = "<video>"
+DEFAULT_VID_END_TOKEN = "</video>"
+DEFAULT_FRAME_PATCH_TOKEN = "<framePatch>"  # repeated once per merged video patch
+
+DEFAULT_AUDIO_PATCH_TOKEN = "<audioPatch>"  # repeated once per encoder output frame
+DEFAULT_AU_START_TOKEN = "<audio>"
+DEFAULT_AU_END_TOKEN = "</audio>"
+
+# High-level placeholders written in user prompts; each is expanded into start/patch/end tokens
+PLACEHOLDER_IMAGE_TOKEN_IN_TEXT = "<IMAGE>"
+PLACEHOLDER_VIDEO_TOKEN_IN_TEXT = "<VIDEO>"
+PLACEHOLDER_AUDIO_TOKEN_IN_TEXT = "<AUDIO>"
+
+# Chat template constants: role tags and the two system prompts (thinking off / thinking on)
+USER_PREFIX = "<role>HUMAN</role>"
+ASSISTANT_PREFIX = "<role>ASSISTANT</role>"
+SYSTEM_PROMPT_NOTHINK = "<role>SYSTEM</role>你是一个友好的AI助手。\n\ndetailed thinking off"
+SYSTEM_PROMPT_THINK = "<role>SYSTEM</role>你是一个友好的AI助手。\n\ndetailed thinking on"
+
+
+_NORM_FACTOR_FOR_DTYPE = {  # divisor applied after the float32 cast; integer PCM scales into [-1, 1)
+    torch.int8: 2**7,
+    torch.int16: 2**15,
+    torch.int32: 2**31,
+    torch.int64: 2**63,
+    torch.float32: 1,
+    torch.float64: 1,
+}
+
+
+def _normalize_audio_tensor(
+    waveform: torch.Tensor,
+    sample_rate: int,
+    target_sample_rate: int = 16000,
+) -> torch.Tensor:
+    """Return `waveform` as a 1-D float32 tensor at `target_sample_rate` (scaled by its dtype factor)."""
+    norm_factor = _NORM_FACTOR_FOR_DTYPE.get(waveform.dtype, 1)  # unlisted dtypes (e.g. uint8) use 1
+    waveform = waveform.to(torch.float32) / norm_factor
+
+    # Keep only index 0 of every leading dim (assumes channel-first input — TODO confirm with callers)
+    while len(waveform.shape) > 1:
+        waveform = waveform[0]
+
+    # Resample only when the rates differ; torchaudio is imported lazily for that case alone
+    if sample_rate != target_sample_rate:
+        import torchaudio
+
+        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
+        waveform = resampler(waveform.unsqueeze(0)).squeeze(0)
+
+    return waveform
+
+
+class MingWhisperFeatureExtractor(FeatureExtractionMixin):
+    """Whisper log-mel feature extractor for Ming-flash-omni-2.0.
+
+    Produces audio_feats in the time-first packed format.
+
+    Adapted from Ming's WhisperAudioEncoder
+    https://github.com/inclusionAI/Ming/blob/070dc3c13f95d97952ab7d22030df0c9e28a5122/modeling_whisper_encoder.py
+    and HF transformers WhisperFeatureExtractor
+    https://github.com/huggingface/transformers/blob/f842abaca95a7dbf3fc6e16122e7409109bc1431/src/transformers/models/whisper/feature_extraction_whisper.py#L33
+    """
+
+    model_input_names = ["audio_feats", "audio_feats_lengths"]
+
+    def __init__(self, feature_size: int = 128, sampling_rate: int = 16000, **kwargs):
+        # feature_size == n_mels; stored so to_dict() serialises it correctly.
+        self.feature_size = feature_size
+        self.sampling_rate = sampling_rate
+        super().__init__(**kwargs)
+
+    @property
+    def n_mels(self) -> int:
+        return self.feature_size
+
+    def __call__(
+        self,
+        audios: tuple | list,
+        return_tensors: str | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """Turn (waveform, sample_rate) pairs into packed log-mel features plus per-clip frame counts."""
+        import whisper
+
+        if not isinstance(audios, list):  # a single (waveform, sample_rate) pair
+            audios = [audios]
+
+        audio_feat_list = []
+        for waveform, sr in audios:
+            if isinstance(waveform, np.ndarray):
+                waveform = torch.from_numpy(waveform)
+            waveform = _normalize_audio_tensor(waveform, sr, target_sample_rate=self.sampling_rate)
+            mel = whisper.log_mel_spectrogram(waveform, n_mels=self.n_mels)
+            audio_feat_list.append(mel.transpose(0, 1))  # [T, n_mels]
+
+        audio_feats_lengths = torch.tensor([[feat.shape[0] for feat in audio_feat_list]], dtype=torch.long)
+        # Two stride-2 convolutions in series:
+        #   1. WhisperAudioEncoder conv2: kernel=3, stride=2, padding=1
+        #      (conv1 has stride=1 and does not change T)
+        #   2. AudioProjector Conv1d: kernel=3, stride=2, padding=1
+        # Combined: T → ((T-1)//2 + 1 - 1)//2 + 1
+        # See also: AudioProjector.compute_output_length()
+        encoder_feats_lengths = ((audio_feats_lengths - 3 + 2 * 1) // 2 + 1 - 3 + 2 * 1) // 2 + 1
+        audio_feats = torch.cat(audio_feat_list, dim=0).unsqueeze(0)  # [1, T_total, n_mels]
+
+        data = {
+            # [1, T_total, n_mels], all audio clips concatenated
+            "audio_feats": audio_feats.numpy(),
+            # [1, n_audios], actual frame count
+            "audio_feats_lengths": audio_feats_lengths.numpy(),
+            # [1, n_audios]; NOTE(review): left as a torch.Tensor while the two entries above are numpy
+            "encoder_feats_lengths": encoder_feats_lengths,
+        }
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+class MingFlashOmniProcessor(ProcessorMixin):
+    """Top-level multimodal processor for Ming-flash-omni 2.0.
+
+    Adapted from Ming's BailingMM2Processor
+    https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/processing_bailingmm2.py
+
+    Subprocessors include:
+    - Qwen2VLImageProcessor (image/video)
+    - MingWhisperFeatureExtractor (modified audio processor using Whisper's log-mel spectrogram)
+    """
+
+    attributes = ["image_processor", "audio_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    audio_processor_class = "AutoFeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor=None,
+        audio_processor=None,
+        tokenizer=None,
+        merge_size: int = 2,
+        **kwargs,  # accepted but not forwarded to ProcessorMixin.__init__
+    ):
+        # Enforce that all sub-processors exist
+        # Keep None defaults in the signature for HF ProcessorMixin compatibility
+        if image_processor is None:
+            raise ValueError("MingFlashOmniProcessor requires `image_processor`.")
+        if audio_processor is None:
+            raise ValueError("MingFlashOmniProcessor requires `audio_processor`.")
+        if tokenizer is None:
+            raise ValueError("MingFlashOmniProcessor requires `tokenizer`.")
+
+        self.spatial_merge_size = merge_size  # divisor (squared) used when counting image/video patches
+        self.image_token = PLACEHOLDER_IMAGE_TOKEN_IN_TEXT
+        self.video_token = PLACEHOLDER_VIDEO_TOKEN_IN_TEXT
+        self.audio_token = PLACEHOLDER_AUDIO_TOKEN_IN_TEXT
+        super().__init__(
+            image_processor=image_processor,
+            audio_processor=audio_processor,
+            tokenizer=tokenizer,
+        )
+
+        # No processor-level template was set: fall back to the tokenizer's own (which may also be None).
+        if self.chat_template is None:
+            self.chat_template = getattr(tokenizer, "chat_template", None)
+
+    def __call__(
+        self,
+        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput],
+        images: Any | None = None,
+        videos: Any | None = None,
+        audios: tuple[np.ndarray, int] | list[tuple[np.ndarray, int]] | None = None,
+        **kwargs,
+    ) -> BatchFeature:
+        # Keep the placeholder expansion below in sync with the `_get_prompt_updates` logic of the
+        # Ming processor (defined outside this file): a change on either side must be mirrored.
+        # Ensure text is a list
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list):
+            raise ValueError("text must be a string or list of strings")
+
+        data: dict[str, Any] = {}
+
+        if images is not None:
+            image_outputs = self.image_processor(
+                images=images,
+                videos=None,
+                return_tensors="pt",
+                **kwargs.get("images_kwargs", {}),
+            )
+            data.update(image_outputs)
+            if "image_grid_thw" in image_outputs:
+                text = self._expand_image_tokens(text, image_outputs["image_grid_thw"])
+
+        if videos is not None:
+            video_outputs = self.image_processor(  # videos reuse the image processor; keys renamed below
+                images=None,
+                videos=videos,
+                return_tensors="pt",
+                **kwargs.get("videos_kwargs", {}),
+            )
+            if "pixel_values" in video_outputs:
+                video_outputs["pixel_values_videos"] = video_outputs.pop("pixel_values")
+            if "image_grid_thw" in video_outputs:
+                video_outputs["video_grid_thw"] = video_outputs.pop("image_grid_thw")
+            data.update(video_outputs)
+            if "video_grid_thw" in video_outputs:
+                text = self._expand_video_tokens(text, video_outputs["video_grid_thw"])
+
+        if audios is not None:
+            audio_outputs = self.audio_processor(
+                audios,
+                return_tensors="pt",
+                **kwargs.get("audio_kwargs", {}),
+            )
+            data.update(audio_outputs)
+            if "encoder_feats_lengths" in audio_outputs:
+                text = self._expand_audio_tokens(text, audio_outputs["encoder_feats_lengths"])
+
+        text_outputs = self.tokenizer(  # tokenizes the prompts after all placeholder expansion above
+            text,
+            return_tensors="pt",
+            **kwargs.get("text_kwargs", {}),
+        )
+        data.update(text_outputs)
+        return BatchFeature(data=data)
+
+    def _expand_image_tokens(
+        self,
+        text: list[str],
+        image_grid_thw: torch.Tensor,
+        special_token: str = PLACEHOLDER_IMAGE_TOKEN_IN_TEXT,
+    ) -> list[str]:
+        merge_size = self.spatial_merge_size
+        num_patches_per_image = torch.prod(image_grid_thw, dim=1) // (merge_size**2)  # one count per grid row
+        prompt_strings = []
+        image_index = 0  # running offset into num_patches_per_image across all samples
+        for sample in text:
+            num_images = sample.count(special_token)
+            if num_images > 0:
+                for i in range(image_index, num_images + image_index):  # NOTE(review): no bounds check
+                    num_patches = int(num_patches_per_image[i].item())
+                    img_text = (
+                        DEFAULT_IM_START_TOKEN + (DEFAULT_IMAGE_PATCH_TOKEN * num_patches) + DEFAULT_IM_END_TOKEN + "\n"
+                    )
+                    sample = sample.replace(special_token, img_text, 1)  # leftmost remaining placeholder
+            image_index += num_images
+            prompt_strings.append(sample)
+        return prompt_strings
+
+    def _expand_video_tokens(
+        self,
+        text: list[str],
+        video_grid_thw: torch.Tensor,
+        special_token: str = PLACEHOLDER_VIDEO_TOKEN_IN_TEXT,
+    ) -> list[str]:
+        merge_size = self.spatial_merge_size
+        num_patches_per_video = torch.prod(video_grid_thw, dim=1) // (merge_size**2)  # one count per grid row
+        prompt_strings = []
+        video_index = 0  # running offset into num_patches_per_video across all samples
+        for sample in text:
+            num_videos = sample.count(special_token)
+            if num_videos > 0:
+                for i in range(video_index, num_videos + video_index):  # NOTE(review): no bounds check
+                    num_patches = int(num_patches_per_video[i].item())
+                    video_text = (
+                        DEFAULT_VID_START_TOKEN
+                        + (DEFAULT_FRAME_PATCH_TOKEN * num_patches)
+                        + DEFAULT_VID_END_TOKEN
+                        + "\n"
+                    )
+                    sample = sample.replace(special_token, video_text, 1)  # leftmost remaining placeholder
+            video_index += num_videos
+            prompt_strings.append(sample)
+        return prompt_strings
+
+    def _expand_audio_tokens(
+        self,
+        text: list[str],
+        encoder_feats_lengths: torch.Tensor,
+        special_token: str = PLACEHOLDER_AUDIO_TOKEN_IN_TEXT,
+    ) -> list[str]:
+        prompt_strings = []
+        # Index the length rows instead of zip(): zip silently dropped every sample past the last row.
+        for idx, sample in enumerate(text):
+            for length in (encoder_feats_lengths[idx] if idx < len(encoder_feats_lengths) else ()):
+                if (num_patches := int(length.item())) == 0:
+                    continue
+                audio_text = DEFAULT_AU_START_TOKEN + (DEFAULT_AUDIO_PATCH_TOKEN * num_patches) + DEFAULT_AU_END_TOKEN
+                if special_token in sample:
+                    sample = sample.replace(special_token, audio_text, 1)
+                else:
+                    sample = sample + audio_text + "\n"
+            prompt_strings.append(sample)  # samples without a lengths row pass through unexpanded
+        return prompt_strings
+
+    def apply_system_template(
+        self,
+        sys_prompt_exp: str | None = None,
+        use_cot_system_prompt: bool = False,
+    ) -> str:
+        sys_prompt = SYSTEM_PROMPT_THINK if use_cot_system_prompt else SYSTEM_PROMPT_NOTHINK
+        if sys_prompt_exp is not None:  # swap the default persona sentence, keep the thinking switch
+            sys_prompt = sys_prompt.replace("你是一个友好的AI助手。", sys_prompt_exp)
+        return sys_prompt
+
+    def apply_chat_template(
+        self,
+        conversation: list[dict[str, Any]],
+        sys_prompt_exp: str | None = None,
+        use_cot_system_prompt: bool = False,
+        **kwargs,
+    ) -> str:
+        """Render `conversation` as a Ming prompt ending with the assistant tag; ValueError on bad roles."""
+        eos = self.tokenizer.eos_token
+        text = self.apply_system_template(sys_prompt_exp, use_cot_system_prompt) + eos
+        for idx, message in enumerate(conversation):
+            # Validate with real exceptions: `assert` statements are stripped under `python -O`.
+            if message["role"] not in ("HUMAN", "ASSISTANT"):
+                raise ValueError(f"Invalid role: {message['role']}. Must be 'HUMAN' or 'ASSISTANT'")
+            if idx == len(conversation) - 1 and message["role"] != "HUMAN":
+                raise ValueError("Last message must be from HUMAN")
+
+            text += USER_PREFIX if message["role"] == "HUMAN" else ASSISTANT_PREFIX
+
+            content = message["content"]
+            if isinstance(content, str):
+                # text-only
+                text += content
+            elif isinstance(content, list):
+                # structured content with multimodal elements
+                # Count existing placeholders from text items only
+                image_placeholders = 0
+                video_placeholders = 0
+                audio_placeholders = 0
+                for content_item in content:
+                    if content_item.get("type", "text") == "text":
+                        t = content_item.get("text", "")
+                        image_placeholders += t.count(PLACEHOLDER_IMAGE_TOKEN_IN_TEXT)
+                        video_placeholders += t.count(PLACEHOLDER_VIDEO_TOKEN_IN_TEXT)
+                        audio_placeholders += t.count(PLACEHOLDER_AUDIO_TOKEN_IN_TEXT)
+
+                if video_placeholders > 1:
+                    raise ValueError("Video count must be at most 1 per message!")
+
+                # Insert placeholders only for media items not already covered
+                for content_item in content:
+                    content_type = content_item.get("type", "text")
+
+                    if content_type == "image":
+                        image_data = content_item.get("image")
+                        if image_data is not None:
+                            from PIL import Image as PILImage
+
+                            num_images = 1 if isinstance(image_data, (str, PILImage.Image)) else len(image_data)
+                            for _ in range(num_images):
+                                if image_placeholders > 0:
+                                    image_placeholders -= 1
+                                else:
+                                    text += PLACEHOLDER_IMAGE_TOKEN_IN_TEXT
+
+                    elif content_type == "video":
+                        if video_placeholders > 0:
+                            video_placeholders -= 1
+                        else:
+                            text += PLACEHOLDER_VIDEO_TOKEN_IN_TEXT
+                    elif content_type == "audio":
+                        audio_data = content_item.get("audio")
+                        if audio_data is not None:
+                            num_audios = 1 if isinstance(audio_data, str) else len(audio_data)
+                            for _ in range(num_audios):
+                                if audio_placeholders > 0:
+                                    audio_placeholders -= 1
+                                else:
+                                    text += PLACEHOLDER_AUDIO_TOKEN_IN_TEXT
+
+                    elif content_type == "text":
+                        text += content_item.get("text", "")
+
+            # Terminate every message, including the last one, with the EOS token
+            text += eos
+
+        text += ASSISTANT_PREFIX
+        return text
+
+    def batch_decode(self, *args, **kwargs):  # thin pass-through to the tokenizer
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):  # thin pass-through to the tokenizer
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        names = (
+            self.tokenizer.model_input_names
+            + self.image_processor.model_input_names
+            + self.audio_processor.model_input_names
+        )
+        return list(dict.fromkeys(names))  # de-duplicate while keeping first-seen order
+
+
+AutoFeatureExtractor.register("MingWhisperFeatureExtractor", MingWhisperFeatureExtractor)  # keyed by class name
+AutoProcessor.register("MingFlashOmniProcessor", MingFlashOmniProcessor)  # keyed by class name