From 50d22a4b77f40fe9eaf5028b19a777a8c8361420 Mon Sep 17 00:00:00 2001 From: Nick Cao Date: Mon, 13 Apr 2026 12:12:28 -0400 Subject: [PATCH 1/4] [Feat] Add peak_normalize utility to vllm_omni.utils.audio Drop-in replacement for sox.Transformer().norm(db_level=...). Scales audio so peak amplitude reaches a target dB level. Co-authored-by: Claude Signed-off-by: Nick Cao --- tests/utils/test_audio.py | 22 ++++++++++++++++++++-- vllm_omni/utils/audio.py | 23 +++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/tests/utils/test_audio.py b/tests/utils/test_audio.py index cfbd2501b25..0e483e64685 100644 --- a/tests/utils/test_audio.py +++ b/tests/utils/test_audio.py @@ -1,12 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Unit tests for vllm_omni.utils.audio.mel_filter_bank.""" +"""Unit tests for vllm_omni.utils.audio.""" +import numpy as np import pytest import torch -from vllm_omni.utils.audio import mel_filter_bank +from vllm_omni.utils.audio import mel_filter_bank, peak_normalize # Parameter combinations used across the codebase. _PARAM_SETS = [ @@ -59,3 +60,20 @@ def test_higher_fmax_extends_coverage(self): last_nonzero_low = (fb_low.sum(dim=0) > 0).nonzero()[-1].item() last_nonzero_high = (fb_high.sum(dim=0) > 0).nonzero()[-1].item() assert last_nonzero_high > last_nonzero_low + + +class TestPeakNormalize: + def test_silence_unchanged(self): + """All-zero input should remain all-zero.""" + audio = np.zeros(1600, dtype=np.float32) + result = peak_normalize(audio, db_level=-6.0) + np.testing.assert_array_equal(result, audio) + + def test_peak_reaches_target(self): + """After normalization, peak amplitude should be at target dB.""" + rng = np.random.default_rng(7) + audio = rng.uniform(-0.4, 0.4, size=16000).astype(np.float32) + + result = peak_normalize(audio, db_level=-6.0) + peak_db = 20 * np.log10(np.abs(result).max()) + np.testing.assert_allclose(peak_db, -6.0, atol=1e-4) diff --git a/vllm_omni/utils/audio.py b/vllm_omni/utils/audio.py index 490737bd530..cc25c179471 100644 --- a/vllm_omni/utils/audio.py +++ b/vllm_omni/utils/audio.py @@ -3,6 +3,7 @@ """Audio utility functions shared across models and entrypoints.""" +import numpy as np import torch from torchaudio.functional import melscale_fbanks @@ -43,3 +44,25 @@ def mel_filter_bank( mel_scale="slaney", norm="slaney", ).T + + +def peak_normalize( + audio: np.ndarray, + db_level: float = -6.0, +) -> np.ndarray: + """Normalize audio so peak amplitude reaches a target dB level. + + Drop-in replacement for ``sox.Transformer().norm(db_level=...)``. + + Args: + audio: Input waveform as a 1-D numpy array. + db_level: Target peak amplitude in dBFS. + + Returns: + Normalized waveform with the same dtype as *audio*. + """ + peak = np.abs(audio).max() + if peak == 0: + return audio + target = 10.0 ** (db_level / 20.0) + return audio * (target / peak) From f2ce9315c85f7cd45c3fc19c86a51531e0f4898e Mon Sep 17 00:00:00 2001 From: Nick Cao Date: Mon, 13 Apr 2026 12:18:09 -0400 Subject: [PATCH 2/4] [Refactor] Replace sox with peak_normalize in speech_vq Replace sox.Transformer().norm(db_level=-6) with peak_normalize, removing the last runtime usage of pysox from the codebase. Co-authored-by: Claude Signed-off-by: Nick Cao --- .../models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py index 9bb2f78c5c0..92cecbff107 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py +++ b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py @@ -17,14 +17,13 @@ from itertools import accumulate import onnxruntime -import sox import torch import torch.nn as nn import torch.nn.functional as F import torchaudio.compliance.kaldi as kaldi from torch import Tensor -from vllm_omni.utils.audio import mel_filter_bank +from vllm_omni.utils.audio import mel_filter_bank, peak_normalize from .core_vq import DistributedGroupResidualVectorQuantization from .whisper_encoder import Conv1d, ConvTranspose1d, WhisperEncoder @@ -153,9 +152,6 @@ def __init__(self, audio_codec_with_xvector): audio_codec_with_xvector, sess_options=option, providers=providers ) - self.tfm = sox.Transformer() - self.tfm.norm(db_level=-6) - self.mel_ext = MelSpectrogramFeatures( filter_length=1024, hop_length=160, @@ -183,8 +179,7 @@ def extract_code(self, audio): return norm_embedding.numpy(), ref_mel.permute(0, 2, 1).squeeze(0).numpy() def sox_norm(self, audio): - wav_norm = self.tfm.build_array(input_array=audio, sample_rate_in=16000) - return wav_norm + return peak_normalize(audio, db_level=-6) class WhisperEncoderVQ(WhisperEncoder): From e71cf363ee711c0081792b8871dbbd624d589706 Mon Sep 17 00:00:00 2001 From: Nick Cao Date: Mon, 13 Apr 2026 12:20:59 -0400 Subject: [PATCH 3/4] [Doc] Remove sox from installation documentation Update ROCm installation docs and Qwen3-TTS README to no longer list sox as a required dependency. Co-authored-by: Claude Signed-off-by: Nick Cao --- docs/getting_started/installation/gpu/rocm.inc.md | 2 +- docs/user_guide/examples/offline_inference/qwen3_tts.md | 4 ++-- examples/offline_inference/qwen3_tts/README.md | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 1a683d174f7..5dfea8d2ffe 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -26,7 +26,7 @@ uv pip install vllm-omni # Optional if want to run Qwen3 TTS uv pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm -uv pip install onnxruntime-rocm sox +uv pip install onnxruntime-rocm ``` # --8<-- [end:pre-built-wheels] diff --git a/docs/user_guide/examples/offline_inference/qwen3_tts.md b/docs/user_guide/examples/offline_inference/qwen3_tts.md index 19fea4132ce..4ece5219d7f 100644 --- a/docs/user_guide/examples/offline_inference/qwen3_tts.md +++ b/docs/user_guide/examples/offline_inference/qwen3_tts.md @@ -18,11 +18,11 @@ Please refer to the [stage configuration documentation](https://docs.vllm.ai/pro ### ROCm Dependencies -You will need to install these two dependencies `onnxruntime-rocm` and `sox`. +You will need to install the dependency `onnxruntime-rocm`. ``` pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm -pip install onnxruntime-rocm sox +pip install onnxruntime-rocm ``` ## Quick Start diff --git a/examples/offline_inference/qwen3_tts/README.md b/examples/offline_inference/qwen3_tts/README.md index bf59dc9ba49..c38a2b462d1 100644 --- a/examples/offline_inference/qwen3_tts/README.md +++ b/examples/offline_inference/qwen3_tts/README.md @@ -15,11 +15,11 @@ Please refer to the [stage configuration documentation](https://docs.vllm.ai/pro ### ROCm Dependencies -You will need to install these two dependencies `onnxruntime-rocm` and `sox`. +You will need to install the dependency `onnxruntime-rocm`. ``` pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm -pip install onnxruntime-rocm sox +pip install onnxruntime-rocm ``` ## Quick Start From c03f27ace77d1833d1317d9f48f81fe55216b677 Mon Sep 17 00:00:00 2001 From: Nick Cao Date: Mon, 13 Apr 2026 12:21:17 -0400 Subject: [PATCH 4/4] [CI/Build] Remove sox from requirements and Dockerfiles Drop sox>=1.5.0 from requirements/common.txt and remove sox/libsox-fmt-all from CI, CUDA, and ROCm Dockerfiles. Co-authored-by: Claude Signed-off-by: Nick Cao --- docker/Dockerfile.ci | 2 +- docker/Dockerfile.cuda | 2 +- docker/Dockerfile.rocm | 2 +- requirements/common.txt | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 2a98de1b812..9cbf89d0b79 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -7,7 +7,7 @@ COPY . . # Install system dependencies RUN apt-get update && \ - apt-get install -y espeak-ng git sox libsox-fmt-all jq && \ + apt-get install -y espeak-ng git jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda index 6ed5b7d2773..28e10f4fb85 100644 --- a/docker/Dockerfile.cuda +++ b/docker/Dockerfile.cuda @@ -7,7 +7,7 @@ WORKDIR ${COMMON_WORKDIR} # Step 1: Setup - Install system dependencies RUN apt-get update && \ - apt-get install -y git sox libsox-fmt-all jq && \ + apt-get install -y git jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index b3447838926..a54aa3b7933 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -21,7 +21,7 @@ WORKDIR ${COMMON_WORKDIR} # Need to include ffmpeg because vllm rocm upstream docker image # does not include it. RUN apt-get update && \ - apt-get install -y espeak-ng ffmpeg git sox libsox-fmt-all jq && \ + apt-get install -y espeak-ng ffmpeg git jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/requirements/common.txt b/requirements/common.txt index 1fff584448d..1f44d343c62 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -10,7 +10,6 @@ tqdm>=4.66.0 torchsde>=0.2.6 openai-whisper>=20250625 imageio[ffmpeg]>=2.37.2 -sox>=1.5.0 x-transformers>=2.12.2 einops>=0.8.1 prettytable>=3.8.0