vllm-project · linyueqian · May 4, 2026 · Mar 30, 2026 · Mar 26, 2026 · Mar 26, 2026
@@ -73,6 +73,17 @@ steps:
 #     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 #     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model
 
+# AudioX online-serving e2e test on the mi325_1 agent pool (gfx942).
+# The marker filter and run level match the "AudioX Online Test" step of the
+# CUDA pipeline so both platforms select the same set of tests.
+- label: "AudioX Online Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_audiox_online.py -m "core_model and diffusion" --run-level core_model
+
 - label: "Diffusion Cache Backend Test"
   agent_pool: mi325_1
   depends_on: amd-build

@@ -120,6 +120,23 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
+  # AudioX online-serving e2e test; only tests matching the
+  # "core_model and diffusion" marker expression are selected.
+  - label: "AudioX Online Test"
+    depends_on: upload-ready-pipeline
+    commands:
+      # `timeout 20m` aborts the pytest run if it exceeds 20 minutes.
+      - timeout 20m pytest -s -v tests/e2e/online_serving/test_audiox_online.py -m "core_model and diffusion" --run-level core_model
+    agents:
+      queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
+    plugins:
+      - docker#v5.2.0:
+          # Test image tagged with the commit under test.
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          always-pull: true
+          propagate-environment: true
+          environment:
+            - "HF_HOME=/fsx/hf_cache"
+            # Bare name with no value here — presumably forwarded from the
+            # agent environment; confirm against the Docker plugin docs.
+            - "HF_TOKEN"
+          volumes:
+            # Host cache directory mounted at the path HF_HOME points to.
+            - "/fsx/hf_cache:/fsx/hf_cache"
+
   - label: "Diffusion Cache Backend Test"
     depends_on: upload-ready-pipeline
     commands:

@@ -53,6 +53,7 @@ th {
 | `FluxPipeline` | FLUX.1-schnell | `black-forest-labs/FLUX.1-schnell` | ✅︎ | ✅︎ | | ✅︎ |
 | `OmniGen2Pipeline` | OmniGen2 | `OmniGen2/OmniGen2` | ✅︎ | ✅︎ | | ✅︎ |
 | `StableAudioPipeline` | Stable-Audio-Open | `stabilityai/stable-audio-open-1.0` | ✅︎ | ✅︎ | | ✅︎ |
+| `AudioXPipeline` | AudioX | `zhangj1an/AudioX` | ✅︎ | ✅︎ | | |
 | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-CustomVoice | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-VoiceDesign | `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-Base | `Qwen/Qwen3-TTS-12Hz-0.6B-Base` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |

@@ -0,0 +1,40 @@
+# AudioX offline inference
+
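+Generate audio with the [AudioX](https://zeyuet.github.io/AudioX/) MMDiT diffusion
+pipeline (`AudioXPipeline`). It supports six tasks: `t2a` (text-to-audio), `t2m`
+(text-to-music), `v2a` (video-to-audio), `v2m` (video-to-music), `tv2a`
+(text+video-to-audio), and `tv2m` (text+video-to-music).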
+
+## Prerequisites
+
+Optionally pre-download the vLLM-Omni weight bundle (component-sharded safetensors):
+
+```bash
+huggingface-cli download zhangj1an/AudioX --local-dir ./audiox_weights
+```
+
+The Hugging Face id `zhangj1an/AudioX` also works directly without prefetching.
+
+## Usage
+
+```bash
+# Text-to-audio only (default uses zhangj1an/AudioX from the Hub):
+python end2end.py --tasks t2a
+
+# All six tasks against a local bundle and a sample video for v2*/tv2*:
+python end2end.py \
+  --model ./audiox_weights \
+  --video https://zeyuet.github.io/AudioX/static/samples/V2M/1XeBotOFqHA.mp4
+
+# Subset of tasks, custom seed and steps:
+python end2end.py --tasks t2a tv2a --num-inference-steps 100 --seed 0
+```
+
+## Arguments
+
+- `--model`: HF id or local bundle path (default: `zhangj1an/AudioX`).
+- `--tasks`: any subset of `t2a t2m v2a v2m tv2a tv2m` (default: all).
+- `--video`: video file/URL — required for `v2*` and `tv2*`.
+- `--reference-audio`: optional audio prompt (audio-conditioned generation).
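+- `--num-inference-steps` (default: 250), `--guidance-scale` (default: 6.0),
+  `--seed` (default: 42), `--seconds-total` (default: 10.0), `--sample-rate`
+  (default: 48000), `--output-dir` (default: `audiox_task_outputs/` next to the
+  script): generation knobs.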
+
+Outputs land in `<output-dir>/<task>.wav` as 16-bit stereo WAV, resampled to
+`--sample-rate` when that differs from the model's native rate.
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""End-to-end AudioX offline example covering the 6 t2*/v2*/tv2* tasks.
+
+Provide a directory with the **vLLM-Omni AudioX safetensors bundle** (e.g. from
+``zhangj1an/AudioX`` on Hugging Face)::
+
+    huggingface-cli download zhangj1an/AudioX --local-dir ./audiox_weights
+    python end2end.py --model ./audiox_weights --video path/to/clip.mp4
+    python end2end.py --model ./audiox_weights --tasks t2a tv2a
+"""
+
+from __future__ import annotations
+
+import argparse
+import time
+from pathlib import Path
+
+import soundfile
+import torch
+import torchaudio.functional as TF
+
+from vllm_omni.entrypoints.omni import Omni
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.platforms import current_omni_platform
+
+# Directory containing this script; the default --output-dir is created under it.
+ROOT = Path(__file__).resolve().parent
+
+# Built-in text prompt for each task. The video-only tasks (v2a, v2m) map to
+# an empty prompt.
+SAMPLE_PROMPTS: dict[str, str] = {
+    "t2a": "Fireworks burst twice, followed by a period of silence before a clock begins ticking.",
+    "t2m": "Uplifting ukulele tune for a travel vlog",
+    "v2a": "",
+    "v2m": "",
+    "tv2a": "drum beating sound and human talking",
+    "tv2m": "uplifting music matching the scene",
+}
+
+# Every supported task id; also the default value of --tasks.
+ALL_TASKS = ("t2a", "t2m", "v2a", "v2m", "tv2a", "tv2m")
+# Tasks that need a video input (--video).
+VIDEO_TASKS = frozenset({"v2a", "v2m", "tv2a", "tv2m"})
+# Tasks that are driven by a text prompt.
+TEXT_TASKS = frozenset({"t2a", "t2m", "tv2a", "tv2m"})
+
+
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(description="AudioX offline end-to-end (6 t2*/v2*/tv2* tasks).")
+    p.add_argument("--model", default="zhangj1an/AudioX", help="HF id or local AudioX bundle path.")
+    p.add_argument("--tasks", nargs="+", default=list(ALL_TASKS), choices=ALL_TASKS)
+    p.add_argument("--video", default="", help="Video path / URL (required for v2*/tv2*).")
+    p.add_argument("--reference-audio", default="", help="Optional audio prompt for audio-conditioned generation.")
+    p.add_argument("--output-dir", default=str(ROOT / "audiox_task_outputs"))
+    p.add_argument("--num-inference-steps", type=int, default=250)
+    p.add_argument("--seconds-total", type=float, default=10.0)
+    p.add_argument("--guidance-scale", type=float, default=6.0)
+    p.add_argument("--seed", type=int, default=42)
+    p.add_argument("--sample-rate", type=int, default=48000, help="Output WAV rate (resampled if != model rate).")
+    return p.parse_args()
+
+
+def save_wav(audio: torch.Tensor, path: Path, sample_rate: int) -> None:
+    """Write 16-bit PCM WAV. ``audio`` is ``[channels, samples]`` float in [-1, 1]."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    soundfile.write(str(path), audio.clamp(-1.0, 1.0).cpu().T.numpy(), sample_rate, subtype="PCM_16")
+
+
+def main() -> None:
+    args = parse_args()
+
+    omni = Omni(model=args.model, model_class_name="AudioXPipeline")
+
+    for task in args.tasks:
+        if task in VIDEO_TASKS and not args.video:
+            raise SystemExit(f"task={task!r} requires --video")
+        prompt = SAMPLE_PROMPTS[task] if task in TEXT_TASKS else ""
+        extra: dict = {"audiox_task": task, "seconds_start": 0.0, "seconds_total": float(args.seconds_total)}
+        if task in VIDEO_TASKS:
+            extra["video_path"] = args.video
+        if args.reference_audio:
+            extra["audio_path"] = args.reference_audio
+
+        generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed)
+        t0 = time.perf_counter()
+        outputs = omni.generate(
+            prompt,
+            OmniDiffusionSamplingParams(
+                generator=generator,
+                guidance_scale=args.guidance_scale,
+                num_inference_steps=args.num_inference_steps,
+                seed=args.seed,
+                extra_args=extra,
+            ),
+        )
+        audio = outputs[0].request_output.multimodal_output.get("audio")
+        if audio is None:
+            raise RuntimeError(f"No audio produced for task {task!r}")
+        audio = torch.as_tensor(audio).detach().cpu().float()
+        if audio.ndim == 3:
+            audio = audio[0]
+
+        model_sr = int(outputs[0].request_output.multimodal_output.get("audio_sample_rate") or 44100)
+        if model_sr != args.sample_rate:
+            audio = TF.resample(audio, model_sr, args.sample_rate)
+
+        out_path = Path(args.output_dir) / f"{task}.wav"
+        save_wav(audio, out_path, args.sample_rate)
+        print(f"[{task}] saved {out_path} ({time.perf_counter() - t0:.2f}s)")
+
+    omni.close()
+
+
+# Run the example only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,65 @@
+# AudioX online serving
+
+Launches the `AudioXPipeline` behind vLLM-Omni's OpenAI-compatible chat endpoint and provides a
+minimal Python client that covers all six tasks (`t2a`, `t2m`, `v2a`, `v2m`, `tv2a`, `tv2m`).
+
+## Start the server
+
+```bash
+cd examples/online_serving/audiox
+bash run_server.sh                 # defaults: MODEL=zhangj1an/AudioX, PORT=8099
+```
+
+Environment overrides: `MODEL`, `PORT`, `DIFFUSION_ATTENTION_BACKEND`.
+
+## Call from Python
+
+```bash
+# text-to-audio
+python openai_chat_client.py --task t2a \
+    --prompt "Fireworks burst twice, followed by a period of silence before a clock begins ticking." \
+    --output t2a.wav
+
+# text-to-music
+python openai_chat_client.py --task t2m \
+    --prompt "Uplifting ukulele tune for a travel vlog" \
+    --output t2m.wav
+
+# video-to-audio (no text)
+python openai_chat_client.py --task v2a --video path/to/clip.mp4 --output v2a.wav
+
+# text+video-to-audio
+python openai_chat_client.py --task tv2a \
+    --prompt "drum beating sound and human talking" \
+    --video path/to/clip.mp4 \
+    --output tv2a.wav
+```
+
+The client sends:
+
+- `num_inference_steps`, `guidance_scale`, `seed` as top-level fields of the chat-completion request body
+- `audiox_task`, `seconds_start`, `seconds_total`, `sigma_min`, `sigma_max` nested under
+  `extra_args` (a reserved dict on the request body that the server forwards verbatim into
+  the pipeline's `sampling_params.extra_args` — the same escape hatch `serving_video.py` exposes
+  as `extra_params` on /v1/videos)
+- For `v2*` / `tv2*` tasks, the video as a `video_url` content item (data URI for local files)
+
+## curl
+
+```bash
+curl -sS -X POST http://localhost:8099/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "model": "zhangj1an/AudioX",
+    "messages": [{"role": "user", "content": [{"type": "text", "text": "Uplifting ukulele"}]}],
+    "num_inference_steps": 250,
+    "guidance_scale": 7.0,
+    "seed": 42,
+    "extra_args": {
+      "audiox_task": "t2m",
+      "seconds_total": 10.0,
+      "sigma_min": 0.3,
+      "sigma_max": 500.0
+    }
+  }' > t2m.json
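+```
+
+The response is a chat-completion JSON object; the generated audio is base64-encoded under
+`choices[].message.audio.data` (this is the field `openai_chat_client.py` decodes).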
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""AudioX OpenAI-compatible chat client.
+
+AudioX supports 6 tasks (t2a, t2m, v2a, v2m, tv2a, tv2m). Text-only tasks send the prompt as the
+chat message; video-conditioned tasks additionally attach the video via a ``video_url`` content
+item (data URI for local files). Steps, guidance scale and seed are sent as top-level fields of
+the JSON request body; the task and the remaining knobs (sigma range, seconds) are nested under
+``extra_args`` — the same pipeline-agnostic escape hatch used by the /v1/videos endpoint's
+``extra_params`` field.
+
+Usage:
+  python openai_chat_client.py --task t2a --prompt "Fireworks burst twice..." --output t2a.wav
+  python openai_chat_client.py --task tv2a --prompt "drum beating" --video clip.mp4 -o tv2a.wav
+"""
+
+from __future__ import annotations
+
+import argparse
+import base64
+import io
+import mimetypes
+import sys
+from pathlib import Path
+
+import requests
+import soundfile
+import torch
+
+# Tasks for which --video is required.
+VIDEO_TASKS = frozenset({"v2a", "v2m", "tv2a", "tv2m"})
+# Tasks for which --prompt is required.
+TEXT_TASKS = frozenset({"t2a", "t2m", "tv2a", "tv2m"})
+
+
+def _to_data_url(path: str) -> str:
+    mime, _ = mimetypes.guess_type(path)
+    mime = mime or "video/mp4"
+    with open(path, "rb") as f:
+        data = base64.b64encode(f.read()).decode("ascii")
+    return f"data:{mime};base64,{data}"
+
+
+def _save_wav(audio: torch.Tensor, path: Path, sample_rate: int) -> None:
+    audio = audio.to(torch.float32)
+    audio = audio / audio.abs().max().clamp(min=1e-8)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    # soundfile expects channels-last (T, C); project convention is (C, T).
+    soundfile.write(str(path), audio.clamp(-1.0, 1.0).cpu().T.numpy(), sample_rate, subtype="PCM_16")
+
+
+def _decode_audio_from_response(body: dict) -> tuple[torch.Tensor, int]:
+    for choice in body.get("choices", []):
+        audio_obj = choice.get("message", {}).get("audio")
+        if not (isinstance(audio_obj, dict) and audio_obj.get("data")):
+            continue
+        data, sr = soundfile.read(io.BytesIO(base64.b64decode(audio_obj["data"])), dtype="float32", always_2d=True)
+        return torch.from_numpy(data).transpose(0, 1), sr
+    brief = {k: v for k, v in body.items() if k != "choices"}
+    raise RuntimeError(f"no audio in response message.audio: {brief}")
+
+
+def main() -> int:
+    p = argparse.ArgumentParser(description="AudioX OpenAI chat client")
+    p.add_argument("--task", required=True, choices=["t2a", "t2m", "v2a", "v2m", "tv2a", "tv2m"])
+    p.add_argument("--prompt", "-p", default="", help="Text prompt (required for t2*/tv2*).")
+    p.add_argument("--video", help="Video path or URL (required for v2*/tv2*).")
+    p.add_argument("--output", "-o", default="audiox_out.wav")
+    p.add_argument("--server", "-s", default="http://localhost:8099")
+    p.add_argument("--model", default="zhangj1an/AudioX")
+    p.add_argument("--steps", type=int, default=250)
+    p.add_argument("--guidance-scale", type=float, default=7.0)
+    p.add_argument("--seed", type=int, default=42)
+    p.add_argument("--seconds-total", type=float, default=10.0)
+    p.add_argument("--seconds-start", type=float, default=0.0)
+    p.add_argument("--sigma-min", type=float, default=0.03)
+    p.add_argument("--sigma-max", type=float, default=1000.0)
+    args = p.parse_args()
+
+    if args.task in VIDEO_TASKS and not args.video:
+        print(f"ERROR: task {args.task!r} requires --video", file=sys.stderr)
+        return 2
+    if args.task in TEXT_TASKS and not args.prompt.strip() and args.task not in {"v2a", "v2m"}:
+        print(f"ERROR: task {args.task!r} requires --prompt", file=sys.stderr)
+        return 2
+
+    content: list[dict] = [{"type": "text", "text": args.prompt}]
+    if args.task in VIDEO_TASKS:
+        vurl = args.video if args.video.startswith(("http://", "https://")) else _to_data_url(args.video)
+        content.append({"type": "video_url", "video_url": {"url": vurl}})
+
+    payload = {
+        "model": args.model,
+        "messages": [{"role": "user", "content": content}],
+        "num_inference_steps": args.steps,
+        "guidance_scale": args.guidance_scale,
+        "seed": args.seed,
+        "extra_args": {
+            "audiox_task": args.task,
+            "seconds_start": args.seconds_start,
+            "seconds_total": args.seconds_total,
+            "sigma_min": args.sigma_min,
+            "sigma_max": args.sigma_max,
+        },
+    }
+
+    print(f"POST {args.server}/v1/chat/completions  task={args.task} steps={args.steps}")
+    r = requests.post(
+        f"{args.server}/v1/chat/completions",
+        headers={"Content-Type": "application/json"},
+        json=payload,
+        timeout=600,
+    )
+    r.raise_for_status()
+    audio, sr = _decode_audio_from_response(r.json())
+    _save_wav(audio, Path(args.output), sr)
+    dur = audio.shape[-1] / sr
+    print(f"saved {args.output}  sr={sr}Hz  duration={dur:.2f}s  channels={audio.shape[0]}")
+    return 0
+
+
+# Use main()'s return value as the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())