vllm-project · tzhouam · Jan 19, 2026 · Jan 8, 2026 · Jan 8, 2026 · Jan 9, 2026
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -94,7 +94,7 @@ steps:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
   - label: "Diffusion Parallelism Test"
-    timeout_in_minutes: 20
+    timeout_in_minutes: 25
     depends_on: image-build
     commands:
       - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
@@ -116,7 +116,7 @@ steps:
     timeout_in_minutes: 20
     depends_on: image-build
     commands:
-      - pytest -s -v tests/diffusion/test_gpu_worker.py
+      - pytest -s -v tests/diffusion/test_gpu_diffusion_worker.py
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:

diff --git a/.buildkite/scripts/simple_test.sh b/.buildkite/scripts/simple_test.sh
@@ -52,3 +52,4 @@ VENV_PYTHON="${VENV_DIR}/bin/python"
 "${VENV_PYTHON}" -m pytest -v -s tests/entrypoints/
 "${VENV_PYTHON}" -m pytest -v -s tests/diffusion/cache/
 "${VENV_PYTHON}" -m pytest -v -s tests/model_executor/models/qwen2_5_omni/test_audio_length.py
+"${VENV_PYTHON}" -m pytest -v -s tests/worker/
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
@@ -54,7 +54,7 @@ steps:
   commands:
     - export MIOPEN_DEBUG_CONV_DIRECT=0
     - export MIOPEN_DEBUG_CONV_GEMM=0
-    - pytest -s -v tests/diffusion/test_gpu_worker.py
+    - pytest -s -v tests/diffusion/test_gpu_diffusion_worker.py
 
 - label: "Omni Model Test Qwen2-5-Omni"
   timeout_in_minutes: 15

diff --git a/README.md b/README.md
@@ -9,13 +9,13 @@ Easy, fast, and cheap omni-modality model serving for everyone
 </h3>
 
 <p align="center">
-| <a href="https://vllm-omni.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://vllm-omni.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> | <a href="docs/assets/WeChat.jpg"><b>WeChat</b></a> |
 </p>
 
 ---
 
 *Latest News* 🔥
-
+- [2026/01] We released [0.14.0rc1](https://github.com/vllm-project/vllm-omni/releases/tag/v0.14.0rc1).
 - [2026/01] We released [0.12.0rc1](https://github.com/vllm-project/vllm-omni/releases/tag/v0.12.0rc1) - a major RC milestone focused on maturing the diffusion stack, strengthening OpenAI-compatible serving, expanding omni-model coverage, and improving stability across platforms (GPU/NPU/ROCm), please check our latest [design](https://docs.google.com/presentation/d/1qv4qMW1rKAqDREMXiUDLIgqqHQe7TDPj/edit?usp=sharing&ouid=110473603432222024453&rtpof=true&sd=true).
 - [2025/11] vLLM community officially released [vllm-project/vllm-omni](https://github.com/vllm-project/vllm-omni) in order to support omni-modality models serving.
 
@@ -70,6 +70,10 @@ Please check out [Contributing to vLLM-Omni](https://vllm-omni.readthedocs.io/en
 ## Join the Community
 Feel free to ask questions, provide feedbacks and discuss with fellow users of vLLM-Omni in `#sig-omni` slack channel at [slack.vllm.ai](https://slack.vllm.ai) or vLLM user forum at [discuss.vllm.ai](https://discuss.vllm.ai).
 
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=vllm-project/vllm-omni&type=date&legend=top-left)](https://www.star-history.com/#vllm-project/vllm-omni&type=date&legend=top-left)
+
 ## License
 
 Apache License 2.0, as found in the [LICENSE](./LICENSE) file.
diff --git a/collect_env.py b/collect_env.py
@@ -57,6 +57,7 @@
         "cpu_info",
         "rocm_version",  # vllm specific field
         "vllm_version",  # vllm specific field
+        "vllm_omni_version",  # vllm-omni specific field
         "vllm_build_flags",  # vllm specific field
         "gpu_topo",  # vllm specific field
         "env_vars",
@@ -289,6 +290,31 @@ def get_vllm_version():
     return __version__
 
 
+def get_vllm_omni_version(run_lambda):  # -> version string, or an "N/A ..." notice when the import fails
+    try:
+        import vllm_omni
+        from vllm_omni import __version__, __version_tuple__
+
+        version_str = __version_tuple__[-1]  # last tuple element; handled below when it looks like "g<sha>[.<x><date>]"
+        if isinstance(version_str, str) and version_str.startswith("g"):
+            if "." in version_str:
+                git_sha = version_str.split(".")[0][1:]  # first dot-segment without its leading "g"
+                date = version_str.split(".")[-1][1:]  # last dot-segment without its one-character prefix
+                return f"{__version__} (git sha: {git_sha}, date: {date})"
+            else:
+                git_sha = version_str[1:]  # no date segment: everything after the "g"
+                return f"{__version__} (git sha: {git_sha})"
+
+        package_dir = os.path.dirname(os.path.abspath(vllm_omni.__file__))  # quoted below: path may contain spaces
+        git_sha = run_and_read_all(run_lambda, f'git -C "{package_dir}" rev-parse --short HEAD')
+        if git_sha:  # falsy result -> fall through to the bare version
+            return f"{__version__} (git sha: {git_sha})"
+
+        return __version__
+    except ImportError:  # covers both a missing package and missing version attributes
+        return "N/A (vllm_omni not installed)"
+
+
 def summarize_vllm_build_flags():
     # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
     return "CUDA Archs: {}; ROCm: {}".format(
@@ -524,6 +550,7 @@ def get_version_or_na(cfg, prefix):
 
     rocm_version = get_rocm_version(run_lambda)
     vllm_version = get_vllm_version()
+    vllm_omni_version = get_vllm_omni_version(run_lambda)
     vllm_build_flags = summarize_vllm_build_flags()
     gpu_topo = get_gpu_topo(run_lambda)
 
@@ -555,6 +582,7 @@ def get_version_or_na(cfg, prefix):
         cpu_info=get_cpu_info(run_lambda),
         rocm_version=rocm_version,
         vllm_version=vllm_version,
+        vllm_omni_version=vllm_omni_version,
         vllm_build_flags=vllm_build_flags,
         gpu_topo=gpu_topo,
         env_vars=get_env_vars(),
@@ -621,6 +649,7 @@ def get_version_or_na(cfg, prefix):
 ==============================
 ROCM Version                 : {rocm_version}
 vLLM Version                 : {vllm_version}
+vLLM-Omni Version            : {vllm_omni_version}
 vLLM Build Flags:
   {vllm_build_flags}
 GPU Topology:

diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
@@ -1,11 +1,17 @@
 ARG VLLM_BASE_IMAGE=vllm/vllm-openai
-ARG VLLM_BASE_TAG=v0.12.0
+ARG VLLM_BASE_TAG=v0.14.0rc2
 FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG}
 ARG APP_DIR=/workspace/vllm-omni
 WORKDIR ${APP_DIR}
 
 COPY . .
 
+# Install system dependencies
+RUN apt-get update && \
+    apt-get install -y ffmpeg && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 # Install vllm-omni into the same uv-managed Python environment used by the base image.
 RUN uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]"
 

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
@@ -2,7 +2,7 @@ ARG BASE_IMAGE=rocm/vllm-dev:nightly_main_20251205
 FROM ${BASE_IMAGE}
 
 ARG COMMON_WORKDIR=/app
-ARG VLLM_VERSION=v0.12.0
+ARG VLLM_VERSION=v0.14.0rc2
 ARG PYTORCH_ROCM_ARCH="gfx942;gfx950"
 
 WORKDIR ${COMMON_WORKDIR}

diff --git a/docs/.nav.yml b/docs/.nav.yml
@@ -3,7 +3,7 @@ nav:
 - User Guide:
   - Getting Started:
     - getting_started/quickstart.md
-    - getting_started/installation
+    - getting_started/installation/*
   - Serving:
     - OpenAI-Compatible API:
       - Image Generation: serving/image_generation_api.md
@@ -26,17 +26,16 @@ nav:
   - Configuration:
     - configuration/README.md
     - configuration/*
-  - Diffusion Acceleration:
-    - Overview: user_guide/diffusion_acceleration.md
-    - Acceleration Methods:
-      - TeaCache: user_guide/acceleration/teacache.md
-      - Cache-DiT: user_guide/acceleration/cache_dit_acceleration.md
-      - Parallelism Acceleration: user_guide/acceleration/parallelism_acceleration.md
   - Models:
     - models/supported_models.md
   - Features:
     - Sleep Mode: features/sleep_mode.md
-    - CPU Offloading for Diffusion Model: features/cpu_offload_diffusion.md
+    - Diffusion Features:
+      - Overview: user_guide/diffusion_acceleration.md
+      - TeaCache: user_guide/diffusion/teacache.md
+      - Cache-DiT: user_guide/diffusion/cache_dit_acceleration.md
+      - Parallelism Acceleration: user_guide/diffusion/parallelism_acceleration.md
+      - CPU Offloading: user_guide/diffusion/cpu_offload_diffusion.md
 - Developer Guide:
   - General:
     - contributing/README.md
@@ -47,7 +46,6 @@ nav:
     - contributing/model/adding_omni_model.md
     - contributing/model/adding_diffusion_model.md
   - CI: contributing/ci
-  - Tests: contributing/tests
   - Design Documents:
     - design/index.md
     - design/architecture_overview.md

diff --git a/docs/api/README.md b/docs/api/README.md
@@ -82,6 +82,7 @@ Model execution components.
 - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_code_predictor_mtp.Qwen3OmniMoeTalkerCodePredictor][]
 - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeModel][]
 - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeTalkerForConditionalGeneration][]
+- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeTalkerSharedExpertWrapper][]
 - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMForCausalLM][]
 - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMModel][]
 - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeConditionalGenerationMixin][]
@@ -102,8 +103,9 @@ Configuration classes.
 
 Worker classes and model runners for distributed inference.
 
-- [vllm_omni.diffusion.worker.gpu_worker.GPUWorker][]
-- [vllm_omni.diffusion.worker.gpu_worker.WorkerProc][]
+- [vllm_omni.diffusion.worker.gpu_diffusion_model_runner.GPUDiffusionModelRunner][]
+- [vllm_omni.diffusion.worker.gpu_diffusion_worker.GPUDiffusionWorker][]
+- [vllm_omni.diffusion.worker.gpu_diffusion_worker.WorkerProc][]
 - [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorker][]
 - [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorkerProc][]
 - [vllm_omni.worker.gpu_ar_model_runner.ExecuteModelState][]

diff --git a/docs/assets/WeChat.jpg b/docs/assets/WeChat.jpg
diff --git a/docs/configuration/README.md b/docs/configuration/README.md
@@ -2,7 +2,7 @@
 
 This section lists the most common options for running vLLM-Omni.
 
-For options within a vLLM Engine. Please refer to [vLLM Configuration](https://docs.vllm.ai/en/v0.12.0/configuration/index.html)
+For options within a vLLM Engine, please refer to [vLLM Configuration](https://docs.vllm.ai/en/v0.14.0/configuration/index.html).
 
 Currently, the main options are maintained by stage configs for each model.
 
@@ -16,6 +16,6 @@ For introduction, please check [Introduction for stage config](./stage_configs.m
 
 ## Optimization Features
 
-- **[TeaCache Configuration](../user_guide/acceleration/teacache.md)** - Enable TeaCache adaptive caching for DiT models to achieve 1.5x-2.0x speedup with minimal quality loss
-- **[Cache-DiT Configuration](../user_guide/acceleration/cache_dit_acceleration.md)** - Enable Cache-DiT as cache acceleration backends for DiT models
-- **[Parallelism Configuration](../user_guide/acceleration/parallelism_acceleration.md)** - Enable parallelism (e.g., sequence parallelism) for for DiT models
+- **[TeaCache Configuration](../user_guide/diffusion/teacache.md)** - Enable TeaCache adaptive caching for DiT models to achieve 1.5x-2.0x speedup with minimal quality loss
+- **[Cache-DiT Configuration](../user_guide/diffusion/cache_dit_acceleration.md)** - Enable Cache-DiT as cache acceleration backends for DiT models
+- **[Parallelism Configuration](../user_guide/diffusion/parallelism_acceleration.md)** - Enable parallelism (e.g., sequence parallelism) for DiT models
diff --git a/docs/contributing/ci/tests_markers.md b/docs/contributing/ci/tests_markers.md
@@ -0,0 +1,160 @@
+# Markers for Tests
+
+By adding markers before test functions, tests can later be executed uniformly by simply declaring the corresponding marker type.
+
+## Current Markers
+Defined in `pyproject.toml`:
+
+| Marker             | Description                                             |
+| ------------------ | ------------------------------------------------------- |
+| `core_model`       | Core model tests (run in each PR)                       |
+| `diffusion`        | Diffusion model tests                                   |
+| `omni`             | Omni model tests                                        |
+| `cache`            | Cache backend tests                                     |
+| `parallel`         | Parallelism/distributed tests                           |
+| `cpu`              | Tests that run on CPU                                   |
+| `gpu`              | Tests that run on GPU (auto-added)                      |
+| `cuda`             | Tests that run on CUDA (auto-added)                     |
+| `rocm`             | Tests that run on AMD/ROCm (auto-added)                 |
+| `npu`              | Tests that run on NPU/Ascend (auto-added)               |
+| `H100`             | Tests that require H100 GPU                             |
+| `L4`               | Tests that require L4 GPU                               |
+| `MI325`            | Tests that require MI325 GPU (AMD/ROCm)                 |
+| `A2`               | Tests that require A2 NPU                               |
+| `A3`               | Tests that require A3 NPU                               |
+| `distributed_cuda` | Tests that require multi cards on CUDA platform         |
+| `distributed_rocm` | Tests that require multi cards on ROCm platform         |
+| `distributed_npu`  | Tests that require multi cards on NPU platform          |
+| `skipif_cuda`      | Skip if the num of CUDA cards is less than the required |
+| `skipif_rocm`      | Skip if the num of ROCm cards is less than the required |
+| `skipif_npu`       | Skip if the num of NPU cards is less than the required  |
+| `slow`             | Slow tests (may skip in quick CI)                       |
+| `benchmark`        | Benchmark tests                                         |
+
+Markers shown as auto-added are applied automatically by the `@hardware_test` decorator.
+
+### Example usage for markers
+
+```python
+from tests.utils import hardware_test
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(
+   res={"cuda": "L4", "rocm": "MI325", "npu": "A2"},
+   num_cards=2,
+)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_video_to_audio():
+    ...
+```
+### Decorator: `@hardware_test`
+
+This decorator is intended to make hardware-aware, cross-platform test authoring easier and more robust for CI/CD environments. The `hardware_test` decorator in `vllm-omni/tests/utils.py` performs the following actions:
+
+1. **Applies platform and resource markers**  
+   Adds the appropriate pytest markers for each specified hardware platform (e.g., `cuda`, `rocm`, `npu`) and resource type (e.g., `L4`, `H100`, `MI325`, `A2`, `A3`).
+   ```
+   @pytest.mark.cuda
+   @pytest.mark.L4
+   ```
+2. **Handles multi-card (distributed) scenarios**  
+   For tests requiring multiple cards, it automatically adds distributed markers such as `distributed_cuda`, `distributed_rocm`, or `distributed_npu`.
+   ```
+   @pytest.mark.distributed_cuda(num_cards=num_cards)
+   ```
+3. **Supports flexible card requirements**  
+   Accepts `num_cards` as either a single integer for all platforms or as a dictionary with per-platform values. If not specified, defaults to 1 card per platform.
+
+4. **Integrates resource validation**  
+   On CUDA, adds a skip marker (`skipif_cuda`) if the system does not have the required number of devices.
+   Support for `skipif_rocm` and `skipif_npu` will be implemented later.
+
+
+5. **Runs each test in a new process**  
+   Automatically wraps the distributed test with a decorator (`@create_new_process_for_each_test`) to ensure isolation and compatibility with multi-process hardware backends.
+
+6. **Works with pytest filtering**  
+   Allows tests to be filtered and selected at runtime using standard pytest marker expressions (e.g., `-m "distributed_cuda and L4"`).
+
+#### Example usage for decorator
+- Single call for multiple platforms:
+    ```python
+    @hardware_test(
+        res={"cuda": "L4", "rocm": "MI325", "npu": "A2"},
+        num_cards={"cuda": 2, "rocm": 2, "npu": 2},
+    )
+    ```
+    or
+    ```python
+    @hardware_test(
+        res={"cuda": "L4", "rocm": "MI325", "npu": "A2"},
+        num_cards=2,
+    )
+    ```
+- `res` must be a dict; supported resources: CUDA (L4/H100), ROCm (MI325), NPU (A2/A3)
+- `num_cards` can be int (all platforms) or dict (per platform); defaults to 1 when missing
+- `hardware_test` automatically applies `@create_new_process_for_each_test` for distributed tests.
+- Distributed markers (`distributed_cuda`, `distributed_rocm`, `distributed_npu`) are auto-added for multi-card cases
+- Filtering examples:
+    - CUDA only: `pytest -m "distributed_cuda and L4"`
+    - ROCm only: `pytest -m "distributed_rocm and MI325"`
+    - NPU only: `pytest -m "distributed_npu"`
+
+## Add Support for a New Platform
+
+If you want to add support for a new platform (e.g., "tpu" for a new accelerator), follow these steps:
+
+1. **Extend the marker list in your pytest config** so that platform/resource markers are defined:
+   ```toml
+   # In pyproject.toml (pytest.ini uses an INI-style [pytest] section instead)
+   [tool.pytest.ini_options]
+   markers = [
+       # ... existing markers ...
+       "tpu: Tests that require TPU device",
+       "TPU_V3: Tests that require TPU v3 hardware",
+       "distributed_tpu: Tests that require multiple TPU devices",
+   ]
+   ```
+2. **Implement a marker construction function for your platform** in `vllm-omni/tests/utils.py`:
+   ```python
+   # In vllm-omni/tests/utils.py
+
+   def tpu_marks(*, res: str, num_cards: int):
+       test_platform = pytest.mark.tpu
+       if res == "TPU_V3":
+           test_resource = pytest.mark.TPU_V3
+       else:
+           raise ValueError(
+               f"Invalid TPU resource type: {res}. Supported: TPU_V3")
+
+       if num_cards == 1:
+           return [test_platform, test_resource]
+       else:
+           test_distributed = pytest.mark.distributed_tpu(num_cards=num_cards)
+           # Optionally: add skipif_tpu when implemented
+           return [test_platform, test_resource, test_distributed]
+   ```
+3. **Update `hardware_test` to recognize your new platform**:
+    In the relevant place (see the `hardware_test` implementation), add:
+    ```python
+    if platform == "tpu":
+        marks = tpu_marks(res=resource, num_cards=cards)
+    ```
+4. **(Recommended) Add a test using your new markers**:
+   ```python
+   @hardware_test(
+       res={"tpu": "TPU_V3"},
+       num_cards=2,
+   )
+   def test_my_tpu_feature():
+       ...
+   ```
+
+**Summary**:  
+- Add pytest markers for your new platform/resources  
+- Implement a marker function (`xxx_marks`)  
+- Plug into `hardware_test`  
+- You're done: tests decorated with `@hardware_test` using your platform now automatically get the correct markers, distribution, and isolation!
+
+See code in `vllm-omni/tests/utils.py` for existing examples (`cuda_marks`, `rocm_marks`, `npu_marks`).