Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
105 commits
Select commit Hold shift + click to select a range
39a1df1
merge main
zhangj1an Mar 30, 2026
ca7585f
address neg prompt
zhangj1an Mar 26, 2026
a69f449
set default output
zhangj1an Mar 26, 2026
b612dcf
allow batch output
zhangj1an Mar 26, 2026
31b206d
pipeline runs e2e after code cleanup
zhangj1an Mar 26, 2026
43eeaad
pass precommit check
zhangj1an Mar 26, 2026
9af08c8
run task e2e
zhangj1an Mar 27, 2026
001129f
add e2e test
zhangj1an Mar 27, 2026
5cda5ab
update unit test
zhangj1an Mar 27, 2026
a5b2b3e
remove redundant protobuf
zhangj1an Mar 27, 2026
9f1c2a6
add a shared oobleck base class
zhangj1an Mar 27, 2026
2ba2c5a
remove unnecessary branch in runtime
zhangj1an Mar 27, 2026
22d7cc3
add shared FFT
zhangj1an Mar 27, 2026
c5f752f
add shared FFT, shared VAE, use shared progress bar
zhangj1an Mar 27, 2026
fd0803a
abstract out audio post process helpers
zhangj1an Mar 27, 2026
6960b50
remove unnecessary changes in data.py
Mar 27, 2026
7e06736
remove redundant sdpa code
Mar 27, 2026
846d29a
undo changes to git ignore
Mar 27, 2026
f555876
put config in end2end.py
Mar 27, 2026
e388bbf
remove redundant serve args
Mar 27, 2026
c7c8f89
seperate out audio and video processing
Mar 27, 2026
be3c8c4
remove redundant if else code
Mar 27, 2026
5adc01c
remove training code
Mar 27, 2026
69c2b49
pass pre commit check
Mar 27, 2026
f92b51c
update doc
Mar 27, 2026
775ef4f
add audiox into buildkite
Mar 27, 2026
8fab5c4
simple rename
Mar 27, 2026
a1c9eb6
make pipeline run e2e
Mar 27, 2026
d88448a
update env to match 0.18.0
zhangj1an Mar 30, 2026
118b517
merge main
zhangj1an Mar 30, 2026
7814b96
merge main
zhangj1an Mar 30, 2026
ba1496f
Merge branch 'main' into jian/audiox
zhangj1an Mar 30, 2026
09fc955
add online serving
zhangj1an Mar 30, 2026
fc35a94
make online serving work
zhangj1an Mar 30, 2026
87faa8c
run example tasks
zhangj1an Mar 30, 2026
6374983
bump audiox dependency
zhangj1an Mar 31, 2026
bd048ef
match audiox with upstream behaviour
zhangj1an Mar 31, 2026
4c1aac9
fix weight remapping
zhangj1an Mar 31, 2026
d69a2b5
fix weight mapping and float16
zhangj1an Mar 31, 2026
3675d14
fixed most of the disparity in DiT
zhangj1an Mar 31, 2026
773ca0b
remove comments
zhangj1an Mar 31, 2026
48e7a01
Merge branch 'main' into jian/audiox
zhangj1an Mar 31, 2026
c1fe23b
branch is fixed yay
zhangj1an Mar 31, 2026
4aed391
fixed pre commit
zhangj1an Mar 31, 2026
6b86bb5
remove irrelevant change
zhangj1an Mar 31, 2026
46f7623
Merge upstream/main
zhangj1an Mar 31, 2026
efd875f
remove helper file
zhangj1an Mar 31, 2026
92ecc2c
fix pre commit
zhangj1an Mar 31, 2026
a0f5cab
fix video conditioning discrepancy
zhangj1an Mar 31, 2026
7da95fb
fix pre commit
zhangj1an Mar 31, 2026
61d6b8c
Merge branch 'main' into jian/audiox
zhangj1an Mar 31, 2026
8185287
fix video conditioning bug
zhangj1an Mar 31, 2026
a5eea7d
Merge branch 'main' into jian/audiox
zhangj1an Mar 31, 2026
30f07e0
merge main
zhangj1an Apr 6, 2026
aa18ba7
remove redundant test
zhangj1an Apr 6, 2026
0602eff
remove ckpt to safetensors conversion
zhangj1an Apr 6, 2026
51e1728
remove oobleck boiler plate code
zhangj1an Apr 6, 2026
2564824
remove redundant oobleck encoder code
zhangj1an Apr 6, 2026
8b0c7f5
centralise weight remapping
zhangj1an Apr 6, 2026
a666db6
remove custom weight loading code
zhangj1an Apr 6, 2026
3fbfb7e
remove boilter plate code
zhangj1an Apr 6, 2026
bf5d781
remove pretransform file
zhangj1an Apr 6, 2026
366dbcf
remove unused fork / del code
zhangj1an Apr 6, 2026
2543891
remove training only code
zhangj1an Apr 6, 2026
ce87555
remove online training code
zhangj1an Apr 6, 2026
96088d2
Merge branch 'main' into jian/audiox
zhangj1an Apr 6, 2026
e3bb2d2
merge main
zhangj1an Apr 11, 2026
e06cd55
remove weight loading redundant key mapping
zhangj1an Apr 11, 2026
c7d7b51
Merge remote-tracking branch 'origin/main' into jian/audiox
zhangj1an Apr 13, 2026
3fd8461
remove glues
zhangj1an Apr 13, 2026
40f65b0
remove custom clip implementation
zhangj1an Apr 13, 2026
001b946
merge main
zhangj1an Apr 15, 2026
339177b
fix weight loading keys
zhangj1an Apr 16, 2026
a868329
adhere to format of 2 files only
zhangj1an Apr 16, 2026
e6b31c0
remove docs/user_guide/examples/offline_inference/xxx.md
zhangj1an Apr 16, 2026
fcf28ce
add online tests
zhangj1an Apr 16, 2026
18154a7
allow output of audios in online serving
zhangj1an Apr 16, 2026
2e06f3c
remove unnecessary wrappers
zhangj1an Apr 16, 2026
4db5c54
remove audio editing support
zhangj1an Apr 16, 2026
97e19cd
remove unused code
zhangj1an Apr 16, 2026
3a4a9ee
clean up sample file
zhangj1an Apr 16, 2026
72ae36e
use diffuser to run k sampler
zhangj1an Apr 16, 2026
bdb1fbc
add back support for audio input
zhangj1an Apr 16, 2026
3309653
pipeline can run e2e
zhangj1an Apr 17, 2026
f7033f8
remove redundant comment
zhangj1an Apr 17, 2026
deff0cd
online serving works
zhangj1an Apr 17, 2026
cdf530a
add TP
zhangj1an Apr 17, 2026
63ddcff
remove dependency for k-diffusion and use media helpers
zhangj1an Apr 17, 2026
969172c
update unit test
zhangj1an Apr 17, 2026
c4e5c8d
Merge branch 'main' into jian/audiox
zhangj1an Apr 17, 2026
bdd1512
use diffuser attention
zhangj1an Apr 17, 2026
b06e578
use einops
zhangj1an Apr 18, 2026
8ffde04
clean up code
zhangj1an Apr 18, 2026
b9eeffb
merg main
zhangj1an Apr 18, 2026
035dab9
resolve conflicts and merge main
zhangj1an Apr 22, 2026
716d6e1
Merge branch 'main' into jian/audiox
princepride May 1, 2026
6a149a6
merge main
zhangj1an May 3, 2026
a07c949
add aduiox recipe
zhangj1an May 3, 2026
88e6f63
clean up code
zhangj1an May 3, 2026
145f070
add audiox_random weights to pass test
zhangj1an May 3, 2026
409c4da
fix dtype mismatch that fails the CI
zhangj1an May 3, 2026
3f4a6c2
assign audiox to the correct level
zhangj1an May 3, 2026
51ba5be
Merge remote-tracking branch 'origin/main' into jian/audiox
zhangj1an May 4, 2026
b96a6ab
replaced torchvision with pyav
zhangj1an May 4, 2026
c134483
shift imports to file head
zhangj1an May 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .buildkite/test-amd-ready.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,17 @@ steps:
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model

- label: "AudioX Online Test"
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- timeout 20m pytest -s -v tests/e2e/online_serving/test_audiox_online.py

- label: "Diffusion Cache Backend Test"
agent_pool: mi325_1
depends_on: amd-build
Expand Down
17 changes: 17 additions & 0 deletions .buildkite/test-ready.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,23 @@ steps:
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"

- label: "AudioX Online Test"
depends_on: upload-ready-pipeline
commands:
- timeout 20m pytest -s -v tests/e2e/online_serving/test_audiox_online.py -m "core_model and diffusion" --run-level core_model
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
- "HF_TOKEN"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"

- label: "Diffusion Cache Backend Test"
depends_on: upload-ready-pipeline
commands:
Expand Down
1 change: 1 addition & 0 deletions docs/models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ th {
| `FluxPipeline` | FLUX.1-schnell | `black-forest-labs/FLUX.1-schnell` | ✅︎ | ✅︎ | | ✅︎ |
| `OmniGen2Pipeline` | OmniGen2 | `OmniGen2/OmniGen2` | ✅︎ | ✅︎ | | ✅︎ |
| `StableAudioPipeline` | Stable-Audio-Open | `stabilityai/stable-audio-open-1.0` | ✅︎ | ✅︎ | | ✅︎ |
| `AudioXPipeline` | AudioX | `zhangj1an/AudioX` | ✅︎ | ✅︎ | | |
| `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-CustomVoice | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
| `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-VoiceDesign | `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
| `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-Base | `Qwen/Qwen3-TTS-12Hz-0.6B-Base` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
Expand Down
40 changes: 40 additions & 0 deletions examples/offline_inference/audiox/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# AudioX offline inference

Generate audio with the [AudioX](https://zeyuet.github.io/AudioX/) MMDiT diffusion
pipeline (`AudioXPipeline`). Six tasks: `t2a`, `t2m`, `v2a`, `v2m`, `tv2a`, `tv2m`.

## Prerequisites

Download a vLLM-Omni weight bundle (component-sharded safetensors):

```bash
huggingface-cli download zhangj1an/AudioX --local-dir ./audiox_weights
```

The Hugging Face id `zhangj1an/AudioX` also works directly without prefetching.

## Usage

```bash
# Text-to-audio only (default uses zhangj1an/AudioX from the Hub):
python end2end.py --tasks t2a

# All six tasks against a local bundle and a sample video for v2*/tv2*:
python end2end.py \
--model ./audiox_weights \
--video https://zeyuet.github.io/AudioX/static/samples/V2M/1XeBotOFqHA.mp4

# Subset of tasks, custom seed and steps:
python end2end.py --tasks t2a tv2a --num-inference-steps 100 --seed 0
```

## Arguments

- `--model`: HF id or local bundle path (default: `zhangj1an/AudioX`).
- `--tasks`: any subset of `t2a t2m v2a v2m tv2a tv2m` (default: all).
- `--video`: video file/URL — required for `v2*` and `tv2*`, and therefore also when `--tasks` is left at its default (which includes those tasks).
- `--reference-audio`: optional audio prompt (audio-conditioned generation).
- `--num-inference-steps`, `--guidance-scale`, `--seed`, `--seconds-total`,
`--sample-rate`, `--output-dir`: generation knobs.

Outputs land in `<output-dir>/<task>.wav` as 16-bit stereo WAV.
110 changes: 110 additions & 0 deletions examples/offline_inference/audiox/end2end.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""End-to-end AudioX offline example covering the 6 t2*/v2*/tv2* tasks.

Provide a directory with the **vLLM-Omni AudioX safetensors bundle** (e.g. from
``zhangj1an/AudioX`` on Hugging Face)::

huggingface-cli download zhangj1an/AudioX --local-dir ./audiox_weights
python end2end.py --model ./audiox_weights
python end2end.py --model ./audiox_weights --tasks t2a tv2a
"""

from __future__ import annotations

import argparse
import time
from pathlib import Path

import soundfile
import torch
import torchaudio.functional as TF

from vllm_omni.entrypoints.omni import Omni
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.platforms import current_omni_platform

# Directory containing this script; the default output directory lives beneath it.
ROOT = Path(__file__).resolve().parent

# Every supported task id, in the order they run when --tasks is not given.
ALL_TASKS = ("t2a", "t2m", "v2a", "v2m", "tv2a", "tv2m")
# Tasks that need a video input.
VIDEO_TASKS = frozenset(("v2a", "v2m", "tv2a", "tv2m"))
# Tasks whose sample prompt is passed to the pipeline; the rest use an empty prompt.
TEXT_TASKS = frozenset(("t2a", "t2m", "tv2a", "tv2m"))

# Built-in sample prompt per task id.
SAMPLE_PROMPTS: dict[str, str] = dict(
    t2a="Fireworks burst twice, followed by a period of silence before a clock begins ticking.",
    t2m="Uplifting ukulele tune for a travel vlog",
    v2a="",
    v2m="",
    tv2a="drum beating sound and human talking",
    tv2m="uplifting music matching the scene",
)


def parse_args() -> argparse.Namespace:
    """Define the example's command-line options and parse ``sys.argv``.

    Returns:
        Namespace with ``model``, ``tasks``, ``video``, ``reference_audio``,
        ``output_dir``, ``num_inference_steps``, ``seconds_total``,
        ``guidance_scale``, ``seed`` and ``sample_rate``.
    """
    parser = argparse.ArgumentParser(description="AudioX offline end-to-end (6 t2*/v2*/tv2* tasks).")
    add = parser.add_argument

    # Model, task selection and conditioning inputs.
    add("--model", default="zhangj1an/AudioX", help="HF id or local AudioX bundle path.")
    add("--tasks", nargs="+", default=list(ALL_TASKS), choices=ALL_TASKS)
    add("--video", default="", help="Video path / URL (required for v2*/tv2*).")
    add("--reference-audio", default="", help="Optional audio prompt for audio-conditioned generation.")
    add("--output-dir", default=str(ROOT / "audiox_task_outputs"))

    # Plain numeric generation knobs: (flag, converter, default).
    numeric_options = (
        ("--num-inference-steps", int, 250),
        ("--seconds-total", float, 10.0),
        ("--guidance-scale", float, 6.0),
        ("--seed", int, 42),
    )
    for flag, converter, default in numeric_options:
        add(flag, type=converter, default=default)

    add("--sample-rate", type=int, default=48000, help="Output WAV rate (resampled if != model rate).")
    return parser.parse_args()


def save_wav(audio: torch.Tensor, path: Path, sample_rate: int) -> None:
    """Save ``audio`` to ``path`` as a 16-bit PCM WAV file.

    Args:
        audio: Float tensor laid out as ``[channels, samples]``; values outside
            [-1, 1] are clipped before writing.
        path: Destination file; missing parent directories are created.
        sample_rate: Sample rate handed to the WAV writer.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    clipped = audio.clamp(-1.0, 1.0)
    # Transpose so samples run along the first axis, as the writer is given here.
    frames = clipped.cpu().T.numpy()
    soundfile.write(str(path), frames, sample_rate, subtype="PCM_16")


def main() -> None:
    """Run each requested AudioX task once and write ``<output-dir>/<task>.wav``.

    The engine is closed even when a task fails part-way through the loop.

    Raises:
        SystemExit: A video-conditioned task was reached without ``--video``.
        RuntimeError: The pipeline returned no ``audio`` entry for a task.
    """
    args = parse_args()

    omni = Omni(model=args.model, model_class_name="AudioXPipeline")
    # try/finally so the engine is released on every exit path, not only on success.
    try:
        for task in args.tasks:
            if task in VIDEO_TASKS and not args.video:
                raise SystemExit(f"task={task!r} requires --video")
            prompt = SAMPLE_PROMPTS[task] if task in TEXT_TASKS else ""
            extra: dict = {"audiox_task": task, "seconds_start": 0.0, "seconds_total": float(args.seconds_total)}
            if task in VIDEO_TASKS:
                extra["video_path"] = args.video
            if args.reference_audio:
                extra["audio_path"] = args.reference_audio

            # A fresh generator per task so every task starts from the same seed.
            generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed)
            t0 = time.perf_counter()
            outputs = omni.generate(
                prompt,
                OmniDiffusionSamplingParams(
                    generator=generator,
                    guidance_scale=args.guidance_scale,
                    num_inference_steps=args.num_inference_steps,
                    seed=args.seed,
                    extra_args=extra,
                ),
            )
            mm_out = outputs[0].request_output.multimodal_output
            audio = mm_out.get("audio")
            if audio is None:
                raise RuntimeError(f"No audio produced for task {task!r}")
            audio = torch.as_tensor(audio).detach().cpu().float()
            if audio.ndim == 3:
                # Presumably a leading batch dimension; keep the first item only.
                audio = audio[0]

            # Fall back to 44100 Hz when the pipeline does not report its sample rate.
            model_sr = int(mm_out.get("audio_sample_rate") or 44100)
            if model_sr != args.sample_rate:
                audio = TF.resample(audio, model_sr, args.sample_rate)

            out_path = Path(args.output_dir) / f"{task}.wav"
            save_wav(audio, out_path, args.sample_rate)
            print(f"[{task}] saved {out_path} ({time.perf_counter() - t0:.2f}s)")
    finally:
        omni.close()


if __name__ == "__main__":
main()
65 changes: 65 additions & 0 deletions examples/online_serving/audiox/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# AudioX online serving

Launches the `AudioXPipeline` behind vLLM-Omni's OpenAI-compatible chat endpoint and provides a
minimal Python client that covers all six tasks (`t2a`, `t2m`, `v2a`, `v2m`, `tv2a`, `tv2m`).

## Start the server

```bash
cd examples/online_serving/audiox
bash run_server.sh # defaults: MODEL=zhangj1an/AudioX, PORT=8099
```

Environment overrides: `MODEL`, `PORT`, `DIFFUSION_ATTENTION_BACKEND`.

## Call from Python

```bash
# text-to-audio
python openai_chat_client.py --task t2a \
--prompt "Fireworks burst twice, followed by a period of silence before a clock begins ticking." \
--output t2a.wav

# text-to-music
python openai_chat_client.py --task t2m \
--prompt "Uplifting ukulele tune for a travel vlog" \
--output t2m.wav

# video-to-audio (no text)
python openai_chat_client.py --task v2a --video path/to/clip.mp4 --output v2a.wav

# text+video-to-audio
python openai_chat_client.py --task tv2a \
--prompt "drum beating sound and human talking" \
--video path/to/clip.mp4 \
--output tv2a.wav
```

The client sends:

- `num_inference_steps`, `guidance_scale`, `seed` as first-class OpenAI chat-completion fields
- `audiox_task`, `seconds_start`, `seconds_total`, `sigma_min`, `sigma_max` nested under
`extra_args` (a reserved dict on the request body that the server forwards verbatim into
the pipeline's `sampling_params.extra_args` — the same escape hatch `serving_video.py` exposes
as `extra_params` on /v1/videos)
- For `v2*` / `tv2*` tasks, the video as a `video_url` content item (data URI for local files)

## curl

```bash
curl -sS -X POST http://localhost:8099/v1/chat/completions \
-H 'Content-Type: application/json' \
-d '{
"model": "zhangj1an/AudioX",
"messages": [{"role": "user", "content": [{"type": "text", "text": "Uplifting ukulele"}]}],
"num_inference_steps": 250,
"guidance_scale": 7.0,
"seed": 42,
"extra_args": {
"audiox_task": "t2m",
"seconds_total": 10.0,
"sigma_min": 0.3,
"sigma_max": 500.0
}
}' > t2m.json
```
119 changes: 119 additions & 0 deletions examples/online_serving/audiox/openai_chat_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""AudioX OpenAI-compatible chat client.

AudioX supports 6 tasks (t2a, t2m, v2a, v2m, tv2a, tv2m). Text-only tasks send the prompt as the
chat message; video-conditioned tasks additionally attach the video via a ``video_url`` content
item (data URI for local files). ``num_inference_steps``, ``guidance_scale`` and ``seed`` are sent
as top-level request fields; the task and the remaining knobs (sigma range, seconds) are nested
under ``extra_args`` in the JSON body — the same pipeline-agnostic escape hatch used by the
/v1/videos endpoint's ``extra_params`` field.

Usage:
python openai_chat_client.py --task t2a --prompt "Fireworks burst twice..." --output t2a.wav
python openai_chat_client.py --task tv2a --prompt "drum beating" --video clip.mp4 -o tv2a.wav
"""

from __future__ import annotations

import argparse
import base64
import io
import mimetypes
import sys
from pathlib import Path

import requests
import soundfile
import torch

# Tasks for which the client requires --video.
VIDEO_TASKS = frozenset(("v2a", "v2m", "tv2a", "tv2m"))
# Tasks for which the client requires a non-empty --prompt.
TEXT_TASKS = frozenset(("t2a", "t2m", "tv2a", "tv2m"))


def _to_data_url(path: str) -> str:
mime, _ = mimetypes.guess_type(path)
mime = mime or "video/mp4"
with open(path, "rb") as f:
data = base64.b64encode(f.read()).decode("ascii")
return f"data:{mime};base64,{data}"


def _save_wav(audio: torch.Tensor, path: Path, sample_rate: int) -> None:
    """Peak-normalise ``audio`` and write it to ``path`` as 16-bit PCM WAV.

    Args:
        audio: Tensor in the project's ``(C, T)`` layout.
        path: Destination file; missing parent directories are created.
        sample_rate: Sample rate handed to the WAV writer.
    """
    samples = audio.to(torch.float32)
    # Scale so the largest magnitude becomes 1; the floor avoids dividing by zero on silence.
    peak = samples.abs().max().clamp(min=1e-8)
    samples = samples / peak
    path.parent.mkdir(parents=True, exist_ok=True)
    # soundfile expects channels-last (T, C); project convention is (C, T).
    frames = samples.clamp(-1.0, 1.0).cpu().T.numpy()
    soundfile.write(str(path), frames, sample_rate, subtype="PCM_16")


def _decode_audio_from_response(body: dict) -> tuple[torch.Tensor, int]:
for choice in body.get("choices", []):
audio_obj = choice.get("message", {}).get("audio")
if not (isinstance(audio_obj, dict) and audio_obj.get("data")):
continue
data, sr = soundfile.read(io.BytesIO(base64.b64decode(audio_obj["data"])), dtype="float32", always_2d=True)
return torch.from_numpy(data).transpose(0, 1), sr
brief = {k: v for k, v in body.items() if k != "choices"}
raise RuntimeError(f"no audio in response message.audio: {brief}")


def main() -> int:
    """Send one AudioX generation request and save the returned audio as WAV.

    Returns:
        Process exit code: ``0`` on success, ``2`` when a required ``--video``
        or ``--prompt`` is missing for the selected task.

    Raises:
        requests.HTTPError: The server answered with an error status.
        RuntimeError: The response carried no audio payload.
    """
    p = argparse.ArgumentParser(description="AudioX OpenAI chat client")
    p.add_argument("--task", required=True, choices=["t2a", "t2m", "v2a", "v2m", "tv2a", "tv2m"])
    p.add_argument("--prompt", "-p", default="", help="Text prompt (required for t2*/tv2*).")
    p.add_argument("--video", help="Video path or URL (required for v2*/tv2*).")
    p.add_argument("--output", "-o", default="audiox_out.wav")
    p.add_argument("--server", "-s", default="http://localhost:8099")
    p.add_argument("--model", default="zhangj1an/AudioX")
    p.add_argument("--steps", type=int, default=250)
    p.add_argument("--guidance-scale", type=float, default=7.0)
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--seconds-total", type=float, default=10.0)
    p.add_argument("--seconds-start", type=float, default=0.0)
    p.add_argument("--sigma-min", type=float, default=0.03)
    p.add_argument("--sigma-max", type=float, default=1000.0)
    args = p.parse_args()

    # Validate task-specific inputs before any network traffic.
    if args.task in VIDEO_TASKS and not args.video:
        print(f"ERROR: task {args.task!r} requires --video", file=sys.stderr)
        return 2
    # TEXT_TASKS already excludes v2a/v2m, so no extra exclusion is needed here.
    if args.task in TEXT_TASKS and not args.prompt.strip():
        print(f"ERROR: task {args.task!r} requires --prompt", file=sys.stderr)
        return 2

    # The text item is always sent (possibly empty); video tasks append a video item.
    content: list[dict] = [{"type": "text", "text": args.prompt}]
    if args.task in VIDEO_TASKS:
        # Remote URLs are passed through; local files are inlined as a data URI.
        vurl = args.video if args.video.startswith(("http://", "https://")) else _to_data_url(args.video)
        content.append({"type": "video_url", "video_url": {"url": vurl}})

    payload = {
        "model": args.model,
        "messages": [{"role": "user", "content": content}],
        "num_inference_steps": args.steps,
        "guidance_scale": args.guidance_scale,
        "seed": args.seed,
        "extra_args": {
            "audiox_task": args.task,
            "seconds_start": args.seconds_start,
            "seconds_total": args.seconds_total,
            "sigma_min": args.sigma_min,
            "sigma_max": args.sigma_max,
        },
    }

    print(f"POST {args.server}/v1/chat/completions task={args.task} steps={args.steps}")
    r = requests.post(
        f"{args.server}/v1/chat/completions",
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=600,
    )
    r.raise_for_status()
    audio, sr = _decode_audio_from_response(r.json())
    _save_wav(audio, Path(args.output), sr)
    dur = audio.shape[-1] / sr
    print(f"saved {args.output} sr={sr}Hz duration={dur:.2f}s channels={audio.shape[0]}")
    return 0


if __name__ == "__main__":
raise SystemExit(main())
Loading
Loading