Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker/Dockerfile.ci
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ COPY . .

# Install system dependencies
RUN apt-get update && \
apt-get install -y espeak-ng git sox libsox-fmt-all jq && \
apt-get install -y espeak-ng git jq && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.cuda
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ WORKDIR ${COMMON_WORKDIR}

# Step 1: Setup - Install system dependencies
RUN apt-get update && \
apt-get install -y git sox libsox-fmt-all jq && \
apt-get install -y git jq && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.rocm
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ WORKDIR ${COMMON_WORKDIR}
# Need to include ffmpeg because vllm rocm upstream docker image
# does not include it.
RUN apt-get update && \
apt-get install -y espeak-ng ffmpeg git sox libsox-fmt-all jq && \
apt-get install -y espeak-ng ffmpeg git jq && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

Expand Down
2 changes: 1 addition & 1 deletion docs/getting_started/installation/gpu/rocm.inc.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ uv pip install vllm-omni

# Optional: only needed if you want to run Qwen3 TTS
uv pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm
uv pip install onnxruntime-rocm sox
uv pip install onnxruntime-rocm
```

# --8<-- [end:pre-built-wheels]
Expand Down
4 changes: 2 additions & 2 deletions docs/user_guide/examples/offline_inference/qwen3_tts.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ Please refer to the [stage configuration documentation](https://docs.vllm.ai/pro

### ROCm Dependencies

You will need to install these two dependencies `onnxruntime-rocm` and `sox`.
You will need to install the dependency `onnxruntime-rocm`.

```
pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm
pip install onnxruntime-rocm sox
pip install onnxruntime-rocm
```

## Quick Start
Expand Down
4 changes: 2 additions & 2 deletions examples/offline_inference/qwen3_tts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ Please refer to the [stage configuration documentation](https://docs.vllm.ai/pro

### ROCm Dependencies

You will need to install these two dependencies `onnxruntime-rocm` and `sox`.
You will need to install the dependency `onnxruntime-rocm`.

```
pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm
pip install onnxruntime-rocm sox
pip install onnxruntime-rocm
```

## Quick Start
Expand Down
1 change: 0 additions & 1 deletion requirements/common.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ tqdm>=4.66.0
torchsde>=0.2.6
openai-whisper>=20250625
imageio[ffmpeg]>=2.37.2
sox>=1.5.0
x-transformers>=2.12.2
einops>=0.8.1
prettytable>=3.8.0
Expand Down
22 changes: 20 additions & 2 deletions tests/utils/test_audio.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""Unit tests for vllm_omni.utils.audio.mel_filter_bank."""
"""Unit tests for vllm_omni.utils.audio."""

import numpy as np
import pytest
import torch

from vllm_omni.utils.audio import mel_filter_bank
from vllm_omni.utils.audio import mel_filter_bank, peak_normalize

# Parameter combinations used across the codebase.
_PARAM_SETS = [
Expand Down Expand Up @@ -59,3 +60,20 @@ def test_higher_fmax_extends_coverage(self):
last_nonzero_low = (fb_low.sum(dim=0) > 0).nonzero()[-1].item()
last_nonzero_high = (fb_high.sum(dim=0) > 0).nonzero()[-1].item()
assert last_nonzero_high > last_nonzero_low


class TestPeakNormalize:
    """Behavioural checks for ``peak_normalize``."""

    def test_silence_unchanged(self):
        """A waveform consisting only of zeros must come back as zeros."""
        silent = np.zeros(1600, dtype=np.float32)
        normalized = peak_normalize(silent, db_level=-6.0)
        np.testing.assert_array_equal(normalized, silent)

    def test_peak_reaches_target(self):
        """The loudest output sample should sit at the requested dB level."""
        generator = np.random.default_rng(7)
        noise = generator.uniform(-0.4, 0.4, size=16000).astype(np.float32)

        normalized = peak_normalize(noise, db_level=-6.0)
        loudest = np.abs(normalized).max()
        # Convert the linear peak to decibels before comparing to the target.
        np.testing.assert_allclose(20 * np.log10(loudest), -6.0, atol=1e-4)
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,13 @@
from itertools import accumulate

import onnxruntime
import sox
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.compliance.kaldi as kaldi
from torch import Tensor

from vllm_omni.utils.audio import mel_filter_bank
from vllm_omni.utils.audio import mel_filter_bank, peak_normalize

from .core_vq import DistributedGroupResidualVectorQuantization
from .whisper_encoder import Conv1d, ConvTranspose1d, WhisperEncoder
Expand Down Expand Up @@ -153,9 +152,6 @@ def __init__(self, audio_codec_with_xvector):
audio_codec_with_xvector, sess_options=option, providers=providers
)

self.tfm = sox.Transformer()
self.tfm.norm(db_level=-6)

self.mel_ext = MelSpectrogramFeatures(
filter_length=1024,
hop_length=160,
Expand Down Expand Up @@ -183,8 +179,7 @@ def extract_code(self, audio):
return norm_embedding.numpy(), ref_mel.permute(0, 2, 1).squeeze(0).numpy()

def sox_norm(self, audio):
    """Peak-normalize *audio* to -6 dB.

    The method name is kept so existing callers continue to work; the
    normalization itself is delegated to ``peak_normalize`` rather than a
    sox transformer, so there is a single return path.

    Args:
        audio: Input waveform array.

    Returns:
        The waveform scaled so that its peak amplitude is at -6 dB.
    """
    return peak_normalize(audio, db_level=-6)


class WhisperEncoderVQ(WhisperEncoder):
Expand Down
23 changes: 23 additions & 0 deletions vllm_omni/utils/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"""Audio utility functions shared across models and entrypoints."""

import numpy as np
import torch
from torchaudio.functional import melscale_fbanks

Expand Down Expand Up @@ -43,3 +44,25 @@ def mel_filter_bank(
mel_scale="slaney",
norm="slaney",
).T


def peak_normalize(
    audio: np.ndarray,
    db_level: float = -6.0,
) -> np.ndarray:
    """Normalize audio so its peak amplitude reaches a target dB level.

    Intended as a replacement for ``sox.Transformer().norm(db_level=...)``.

    Args:
        audio: Input waveform as a numpy array.
        db_level: Target peak amplitude in dBFS.

    Returns:
        The scaled waveform. Floating-point input keeps its dtype.
        Integer input is returned as ``float64``, because scaled integer
        samples cannot be represented without truncation. Empty or
        all-zero input is returned unchanged.
    """
    samples = np.asarray(audio)

    # ``max()`` of an empty array raises; there is nothing to scale anyway.
    if samples.size == 0:
        return samples

    # Promote integer PCM before taking ``abs``: the most negative value of a
    # signed integer type has no positive counterpart and would wrap around.
    if np.issubdtype(samples.dtype, np.integer):
        working = samples.astype(np.float64)
    else:
        working = samples

    peak = np.abs(working).max()
    if peak == 0:
        # Pure silence: scaling is undefined (division by zero), keep as-is.
        return samples

    gain = 10.0 ** (db_level / 20.0) / float(peak)
    scaled = working * gain

    # Guarantee the documented dtype contract for floating-point waveforms
    # irrespective of the NumPy scalar-promotion rules in effect.
    if np.issubdtype(samples.dtype, np.floating):
        scaled = scaled.astype(samples.dtype, copy=False)
    return scaled
Loading