vllm-project · gcanlin · Apr 17, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026
@@ -7,7 +7,7 @@ COPY . .
 
 # Install system dependencies
 RUN apt-get update && \
-    apt-get install -y espeak-ng git sox libsox-fmt-all jq && \
+    apt-get install -y espeak-ng git jq && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 

@@ -7,7 +7,7 @@ WORKDIR ${COMMON_WORKDIR}
 
 # Step 1: Setup - Install system dependencies
 RUN apt-get update && \
-    apt-get install -y git sox libsox-fmt-all jq && \
+    apt-get install -y git jq && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 

@@ -21,7 +21,7 @@ WORKDIR ${COMMON_WORKDIR}
 # Need to include ffmpeg because vllm rocm upstream docker image
 # does not include it.
 RUN apt-get update && \
-    apt-get install -y espeak-ng ffmpeg git sox libsox-fmt-all jq && \
+    apt-get install -y espeak-ng ffmpeg git jq && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 

@@ -26,7 +26,7 @@ uv pip install vllm-omni
 
 # Optional if want to run Qwen3 TTS
 uv pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm
-uv pip install onnxruntime-rocm sox
+uv pip install onnxruntime-rocm
 ```
 
 # --8<-- [end:pre-built-wheels]

@@ -18,11 +18,11 @@ Please refer to the [stage configuration documentation](https://docs.vllm.ai/pro
 
 ### ROCm Dependencies
 
-You will need to install these two dependencies `onnxruntime-rocm` and `sox`.
+You will need to install the dependency `onnxruntime-rocm`.
 
 ```
 pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm
-pip install onnxruntime-rocm sox
+pip install onnxruntime-rocm
 ```
 
 ## Quick Start

@@ -15,11 +15,11 @@ Please refer to the [stage configuration documentation](https://docs.vllm.ai/pro
 
 ### ROCm Dependencies
 
-You will need to install these two dependencies `onnxruntime-rocm` and `sox`.
+You will need to install the dependency `onnxruntime-rocm`.
 
 ```
 pip uninstall onnxruntime # should be removed before we can install onnxruntime-rocm
-pip install onnxruntime-rocm sox
+pip install onnxruntime-rocm
 ```
 
 ## Quick Start

@@ -10,7 +10,6 @@ tqdm>=4.66.0
 torchsde>=0.2.6
 openai-whisper>=20250625
 imageio[ffmpeg]>=2.37.2
-sox>=1.5.0
 x-transformers>=2.12.2
 einops>=0.8.1
 prettytable>=3.8.0

@@ -1,12 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-"""Unit tests for vllm_omni.utils.audio.mel_filter_bank."""
+"""Unit tests for vllm_omni.utils.audio."""
 
+import numpy as np
 import pytest
 import torch
 
-from vllm_omni.utils.audio import mel_filter_bank
+from vllm_omni.utils.audio import mel_filter_bank, peak_normalize
 
 # Parameter combinations used across the codebase.
 _PARAM_SETS = [
@@ -59,3 +60,20 @@ def test_higher_fmax_extends_coverage(self):
         last_nonzero_low = (fb_low.sum(dim=0) > 0).nonzero()[-1].item()
         last_nonzero_high = (fb_high.sum(dim=0) > 0).nonzero()[-1].item()
         assert last_nonzero_high > last_nonzero_low
+
+
+class TestPeakNormalize:
+    def test_silence_unchanged(self):
+        """A zero-valued waveform must pass through untouched."""
+        silent = np.zeros(1600, dtype=np.float32)
+        np.testing.assert_array_equal(peak_normalize(silent, db_level=-6.0), silent)
+
+    def test_peak_reaches_target(self):
+        """The loudest output sample should sit at the requested dB level."""
+        # Fixed seed keeps the random waveform reproducible between runs.
+        generator = np.random.default_rng(7)
+        samples = generator.uniform(-0.4, 0.4, size=16000).astype(np.float32)
+        normalized = peak_normalize(samples, db_level=-6.0)
+
+        loudest = np.abs(normalized).max()
+        np.testing.assert_allclose(20 * np.log10(loudest), -6.0, atol=1e-4)
@@ -17,14 +17,13 @@
 from itertools import accumulate
 
 import onnxruntime
-import sox
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio.compliance.kaldi as kaldi
 from torch import Tensor
 
-from vllm_omni.utils.audio import mel_filter_bank
+from vllm_omni.utils.audio import mel_filter_bank, peak_normalize
 
 from .core_vq import DistributedGroupResidualVectorQuantization
 from .whisper_encoder import Conv1d, ConvTranspose1d, WhisperEncoder
@@ -153,9 +152,6 @@ def __init__(self, audio_codec_with_xvector):
             audio_codec_with_xvector, sess_options=option, providers=providers
         )
 
-        self.tfm = sox.Transformer()
-        self.tfm.norm(db_level=-6)
-
         self.mel_ext = MelSpectrogramFeatures(
             filter_length=1024,
             hop_length=160,
@@ -183,8 +179,7 @@ def extract_code(self, audio):
         return norm_embedding.numpy(), ref_mel.permute(0, 2, 1).squeeze(0).numpy()
 
     def sox_norm(self, audio):
-        wav_norm = self.tfm.build_array(input_array=audio, sample_rate_in=16000)
-        return wav_norm
+        return peak_normalize(audio, db_level=-6)
 
 
 class WhisperEncoderVQ(WhisperEncoder):

@@ -3,6 +3,7 @@
 
 """Audio utility functions shared across models and entrypoints."""
 
+import numpy as np
 import torch
 from torchaudio.functional import melscale_fbanks
 
@@ -43,3 +44,25 @@ def mel_filter_bank(
         mel_scale="slaney",
         norm="slaney",
     ).T
+
+
+def peak_normalize(
+    audio: np.ndarray,
+    db_level: float = -6.0,
+) -> np.ndarray:
+    """Scale *audio* so its peak absolute amplitude sits at *db_level* dBFS.
+
+    Drop-in replacement for ``sox.Transformer().norm(db_level=...)``.
+
+    Args:
+        audio: Input waveform as a numpy array (1.0 is taken as full scale).
+        db_level: Target peak amplitude in dBFS.
+
+    Returns:
+        The rescaled waveform; empty or all-zero input is returned as-is.
+    """
+    # Guard the empty case first: ``max`` of a zero-size array raises ValueError.
+    peak = np.abs(audio).max() if np.size(audio) else 0
+    if peak == 0:
+        return audio
+    return audio * (10.0 ** (db_level / 20.0) / peak)