From 50d22a4b77f40fe9eaf5028b19a777a8c8361420 Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Mon, 13 Apr 2026 12:12:28 -0400
Subject: [PATCH 1/4] [Feat] Add peak_normalize utility to
 vllm_omni.utils.audio

Drop-in replacement for sox.Transformer().norm(db_level=...).
Scales audio so peak amplitude reaches a target dB level.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 tests/utils/test_audio.py | 22 ++++++++++++++++++++--
 vllm_omni/utils/audio.py  | 23 +++++++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/tests/utils/test_audio.py b/tests/utils/test_audio.py
index cfbd2501b25..0e483e64685 100644
--- a/tests/utils/test_audio.py
+++ b/tests/utils/test_audio.py
@@ -1,12 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-"""Unit tests for vllm_omni.utils.audio.mel_filter_bank."""
+"""Unit tests for vllm_omni.utils.audio."""
 
+import numpy as np
 import pytest
 import torch
 
-from vllm_omni.utils.audio import mel_filter_bank
+from vllm_omni.utils.audio import mel_filter_bank, peak_normalize
 
 # Parameter combinations used across the codebase.
 _PARAM_SETS = [
@@ -59,3 +60,20 @@ def test_higher_fmax_extends_coverage(self):
         last_nonzero_low = (fb_low.sum(dim=0) > 0).nonzero()[-1].item()
         last_nonzero_high = (fb_high.sum(dim=0) > 0).nonzero()[-1].item()
         assert last_nonzero_high > last_nonzero_low
+
+
+class TestPeakNormalize:
+    def test_silence_unchanged(self):
+        """All-zero input has no peak to scale and must come back unchanged."""
+        audio = np.zeros(1600, dtype=np.float32)
+        result = peak_normalize(audio, db_level=-6.0)
+        np.testing.assert_array_equal(result, audio)
+
+    def test_peak_reaches_target(self):
+        """Non-silent input is scaled so its peak sits at db_level (+-1e-4 dB)."""
+        rng = np.random.default_rng(7)
+        audio = rng.uniform(-0.4, 0.4, size=16000).astype(np.float32)
+
+        result = peak_normalize(audio, db_level=-6.0)
+        peak_db = 20 * np.log10(np.abs(result).max())
+        np.testing.assert_allclose(peak_db, -6.0, atol=1e-4)
diff --git a/vllm_omni/utils/audio.py b/vllm_omni/utils/audio.py
index 490737bd530..cc25c179471 100644
--- a/vllm_omni/utils/audio.py
+++ b/vllm_omni/utils/audio.py
@@ -3,6 +3,7 @@
 
 """Audio utility functions shared across models and entrypoints."""
 
+import numpy as np
 import torch
 from torchaudio.functional import melscale_fbanks
 
@@ -43,3 +44,25 @@ def mel_filter_bank(
         mel_scale="slaney",
         norm="slaney",
     ).T
+
+
+def peak_normalize(
+    audio: np.ndarray,
+    db_level: float = -6.0,
+) -> np.ndarray:
+    """Normalize audio so peak amplitude reaches a target dB level.
+
+    Drop-in replacement for ``sox.Transformer().norm(db_level=...)``.
+
+    Args:
+        audio: Input waveform as a 1-D numpy array (may be empty).
+        db_level: Target peak amplitude in dBFS.
+
+    Returns:
+        Scaled waveform; empty or all-zero input is returned as-is.
+    """
+    peak = np.abs(audio).max(initial=0)  # initial=0: defined for empty input
+    if peak == 0:
+        return audio
+    target = 10.0 ** (db_level / 20.0)
+    return audio * (target / peak)

From f2ce9315c85f7cd45c3fc19c86a51531e0f4898e Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Mon, 13 Apr 2026 12:18:09 -0400
Subject: [PATCH 2/4] [Refactor] Replace sox with peak_normalize in speech_vq

Replace sox.Transformer().norm(db_level=-6) with peak_normalize,
removing the last runtime usage of pysox from the codebase.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 .../models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py      | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py
index 9bb2f78c5c0..92cecbff107 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py
@@ -17,14 +17,13 @@
 from itertools import accumulate
 
 import onnxruntime
-import sox
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio.compliance.kaldi as kaldi
 from torch import Tensor
 
-from vllm_omni.utils.audio import mel_filter_bank
+from vllm_omni.utils.audio import mel_filter_bank, peak_normalize
 
 from .core_vq import DistributedGroupResidualVectorQuantization
 from .whisper_encoder import Conv1d, ConvTranspose1d, WhisperEncoder
@@ -153,9 +152,6 @@ def __init__(self, audio_codec_with_xvector):
             audio_codec_with_xvector, sess_options=option, providers=providers
         )
 
-        self.tfm = sox.Transformer()
-        self.tfm.norm(db_level=-6)
-
         self.mel_ext = MelSpectrogramFeatures(
             filter_length=1024,
             hop_length=160,
@@ -183,8 +179,7 @@ def extract_code(self, audio):
         return norm_embedding.numpy(), ref_mel.permute(0, 2, 1).squeeze(0).numpy()
 
     def sox_norm(self, audio):
-        wav_norm = self.tfm.build_array(input_array=audio, sample_rate_in=16000)
-        return wav_norm
+        return peak_normalize(audio, db_level=-6)
 
 
 class WhisperEncoderVQ(WhisperEncoder):

From e71cf363ee711c0081792b8871dbbd624d589706 Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Mon, 13 Apr 2026 12:20:59 -0400
Subject: [PATCH 3/4] [Doc] Remove sox from installation documentation

Update ROCm installation docs and Qwen3-TTS README to no longer
list sox as a required dependency.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 docs/getting_started/installation/gpu/rocm.inc.md       | 2 +-
 docs/user_guide/examples/offline_inference/qwen3_tts.md | 4 ++--
 examples/offline_inference/qwen3_tts/README.md          | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md
index 1a683d174f7..5dfea8d2ffe 100644
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@@ -26,7 +26,7 @@ uv pip install vllm-omni
 
 # Optional if want to run Qwen3 TTS
 uv pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm
-uv pip install onnxruntime-rocm sox
+uv pip install onnxruntime-rocm
 ```
 
 # --8<-- [end:pre-built-wheels]
diff --git a/docs/user_guide/examples/offline_inference/qwen3_tts.md b/docs/user_guide/examples/offline_inference/qwen3_tts.md
index 19fea4132ce..4ece5219d7f 100644
--- a/docs/user_guide/examples/offline_inference/qwen3_tts.md
+++ b/docs/user_guide/examples/offline_inference/qwen3_tts.md
@@ -18,11 +18,11 @@ Please refer to the [stage configuration documentation](https://docs.vllm.ai/pro
 
 ### ROCm Dependencies
 
-You will need to install these two dependencies `onnxruntime-rocm` and `sox`.
+You will need to install the dependency `onnxruntime-rocm`.
 
 ```
 pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm
-pip install onnxruntime-rocm sox
+pip install onnxruntime-rocm
 ```
 
 ## Quick Start
diff --git a/examples/offline_inference/qwen3_tts/README.md b/examples/offline_inference/qwen3_tts/README.md
index bf59dc9ba49..c38a2b462d1 100644
--- a/examples/offline_inference/qwen3_tts/README.md
+++ b/examples/offline_inference/qwen3_tts/README.md
@@ -15,11 +15,11 @@ Please refer to the [stage configuration documentation](https://docs.vllm.ai/pro
 
 ### ROCm Dependencies
 
-You will need to install these two dependencies `onnxruntime-rocm` and `sox`.
+You will need to install the dependency `onnxruntime-rocm`.
 
 ```
 pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm
-pip install onnxruntime-rocm sox
+pip install onnxruntime-rocm
 ```
 
 ## Quick Start

From c03f27ace77d1833d1317d9f48f81fe55216b677 Mon Sep 17 00:00:00 2001
From: Nick Cao <ncao@redhat.com>
Date: Mon, 13 Apr 2026 12:21:17 -0400
Subject: [PATCH 4/4] [CI/Build] Remove sox from requirements and Dockerfiles

Drop sox>=1.5.0 from requirements/common.txt and remove
sox/libsox-fmt-all from CI, CUDA, and ROCm Dockerfiles.

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Nick Cao <ncao@redhat.com>
---
 docker/Dockerfile.ci    | 2 +-
 docker/Dockerfile.cuda  | 2 +-
 docker/Dockerfile.rocm  | 2 +-
 requirements/common.txt | 1 -
 4 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
index 2a98de1b812..9cbf89d0b79 100644
--- a/docker/Dockerfile.ci
+++ b/docker/Dockerfile.ci
@@ -7,7 +7,7 @@ COPY . .
 
 # Install system dependencies
 RUN apt-get update && \
-    apt-get install -y espeak-ng git sox libsox-fmt-all jq && \
+    apt-get install -y espeak-ng git jq && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda
index 6ed5b7d2773..28e10f4fb85 100644
--- a/docker/Dockerfile.cuda
+++ b/docker/Dockerfile.cuda
@@ -7,7 +7,7 @@ WORKDIR ${COMMON_WORKDIR}
 
 # Step 1: Setup - Install system dependencies
 RUN apt-get update && \
-    apt-get install -y git sox libsox-fmt-all jq && \
+    apt-get install -y git jq && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index b3447838926..a54aa3b7933 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -21,7 +21,7 @@ WORKDIR ${COMMON_WORKDIR}
 # Need to include ffmpeg because vllm rocm upstream docker image
 # does not include it.
 RUN apt-get update && \
-    apt-get install -y espeak-ng ffmpeg git sox libsox-fmt-all jq && \
+    apt-get install -y espeak-ng ffmpeg git jq && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
diff --git a/requirements/common.txt b/requirements/common.txt
index 1fff584448d..1f44d343c62 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -10,7 +10,6 @@ tqdm>=4.66.0
 torchsde>=0.2.6
 openai-whisper>=20250625
 imageio[ffmpeg]>=2.37.2
-sox>=1.5.0
 x-transformers>=2.12.2
 einops>=0.8.1
 prettytable>=3.8.0