bump version to 2.xx #2301

Merged (8 commits) on Jan 18, 2024
3 changes: 1 addition & 2 deletions .github/workflows/unit_test.yml
@@ -40,11 +40,10 @@ jobs:
run: |
set -eux
pip install -r requirements.txt
sudo apt update && sudo apt install -y ffmpeg
sudo apt update && sudo apt install -y ffmpeg libsox-dev
- name: Run Pytest
run: |
set -eux
pytest --version
PYTHONPATH="${PYTHONPATH:-}:$(pwd)" pytest -q
if [ $? != 0 ]; then exit 1; fi

14 changes: 12 additions & 2 deletions README.md
@@ -58,8 +58,18 @@ git clone https://github.com/wenet-e2e/wenet.git
``` sh
conda create -n wenet python=3.8
conda activate wenet
conda install conda-forge::sox
pip install -r requirements.txt
pre-commit install # for clean and tidy code

# If you encounter sox compatibility issues
RuntimeError: set_buffer_size requires sox extension which is not available.
# ubuntu
sudo apt-get install sox libsox-dev
# centos
sudo yum install sox sox-devel
# conda env
conda install conda-forge::sox
```

**Build for deployment**
@@ -100,8 +110,8 @@ Please scan the personal QR code on the right, and the guy is responsible for in

``` bibtex
@inproceedings{yao2021wenet,
title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
booktitle={Proc. Interspeech},
year={2021},
address={Brno, Czech Republic },
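Regarding the sox troubleshooting block added to the README above: a quick way to confirm that torchaudio can actually reach the sox extension is sketched below (not part of the PR; the `set_buffer_size(16500)` call mirrors the one used in the new unit test).

```python
import torchaudio

# List the I/O backends torchaudio can dispatch to, e.g. ['ffmpeg', 'sox', 'soundfile']
print(torchaudio.list_audio_backends())

# Raises "RuntimeError: set_buffer_size requires sox extension which is not available."
# if torchaudio was built or installed without sox support.
torchaudio.utils.sox_utils.set_buffer_size(16500)
```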
4 changes: 2 additions & 2 deletions requirements.txt
@@ -13,8 +13,8 @@ flake8-pyi==20.5.0
mccabe
pycodestyle==2.6.0
pyflakes==2.2.0
torch==1.13.0
torchaudio==0.13.0
torch==2.1.2
torchaudio==2.1.2
tqdm
deepspeed
librosa
1 change: 1 addition & 0 deletions test/resources/dataset/aishell-BAC009S0724W0121.wav
2 changes: 2 additions & 0 deletions test/resources/dataset/data.list
@@ -0,0 +1,2 @@
{"key": "test/resources/dataset/aishell-BAC009S0724W0121", "wav": "test/resources/dataset/aishell-BAC009S0724W0121.wav", "txt": "广州市房地产中介协会分析"}
{"key": "test/resources/dataset/librispeech-1995-1837-0001", "wav": "test/resources/dataset/librispeech-1995-1837-0001.wav", "txt": "IT WAS THE FIRST GREAT SORROW OF HIS LIFE IT WAS NOT SO MUCH THE LOSS OF THE COTTON ITSELF BUT THE FANTASY THE HOPES THE DREAMS BUILT AROUND IT"}
1 change: 1 addition & 0 deletions test/resources/dataset/data.shards.list
@@ -0,0 +1 @@
test/resources/dataset/shards/shards_000000000.tar
1 change: 1 addition & 0 deletions test/resources/dataset/librispeech-1995-1837-0001.wav
Binary file not shown.
2 changes: 2 additions & 0 deletions test/resources/dataset/text
@@ -0,0 +1,2 @@
test/resources/dataset/aishell-BAC009S0724W0121 广州市房地产中介协会分析
test/resources/dataset/librispeech-1995-1837-0001 IT WAS THE FIRST GREAT SORROW OF HIS LIFE IT WAS NOT SO MUCH THE LOSS OF THE COTTON ITSELF BUT THE FANTASY THE HOPES THE DREAMS BUILT AROUND IT
2 changes: 2 additions & 0 deletions test/resources/dataset/wav.scp
@@ -0,0 +1,2 @@
test/resources/dataset/aishell-BAC009S0724W0121 test/resources/dataset/aishell-BAC009S0724W0121.wav
test/resources/dataset/librispeech-1995-1837-0001 test/resources/dataset/librispeech-1995-1837-0001.wav
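These fixtures follow the usual Kaldi-style layout: `wav.scp` maps an utterance key to a wav path, and `text` maps the same key to its transcript, with key and value separated by the first whitespace. A minimal reader sketch (paths as above):

```python
def read_kv(path):
    # Each line is "<key> <value...>"; split on the first whitespace only
    table = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, value = line.strip().split(maxsplit=1)
            table[key] = value
    return table

wavs = read_kv("test/resources/dataset/wav.scp")
texts = read_kv("test/resources/dataset/text")
assert wavs.keys() == texts.keys()
```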
31 changes: 31 additions & 0 deletions test/tools/test_make_shard.py
@@ -0,0 +1,31 @@
import glob
import io
import torch
from torchaudio._extension import torchaudio


def test_save_load_consistently():
wav_paths = glob.glob("test/resources/*.wav")
for wav_path in wav_paths:
wav, sr = torchaudio.load(wav_path)
with io.BytesIO() as f:
wav = torchaudio.transforms.Resample(sr, sr)(wav)
wav_short = (wav * (1 << 15))
wav_short = wav_short.to(torch.int16)
torchaudio.save(f, wav_short, sr, format="wav", bits_per_sample=16)
f.seek(0)
b = f.read()

with io.BytesIO(b) as f:
new_wav, new_sr = torchaudio.load(f)
assert new_sr == sr
torch.allclose(new_wav, wav)


def test_sox_set_buffer():
torchaudio.utils.sox_utils.set_buffer_size(16500)


def test_make_shards():
# TODO(MDdct): add make shards
pass
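Note that the `torch.allclose(new_wav, wav)` call in `test_save_load_consistently` above is not asserted. A version that does assert, with an explicit tolerance for the int16 quantization, might look roughly like this (a sketch, not part of the PR):

```python
import io
import torch
import torchaudio

wav, sr = torchaudio.load("test/resources/dataset/aishell-BAC009S0724W0121.wav")

buf = io.BytesIO()
# torchaudio.load returns normalized float32, so scale to 16-bit PCM before saving
torchaudio.save(buf, (wav * (1 << 15)).to(torch.int16), sr,
                format="wav", bits_per_sample=16)
buf.seek(0)

new_wav, new_sr = torchaudio.load(buf)  # back to normalized float32
assert new_sr == sr
# int16 quantization introduces at most ~1/2**15 absolute error per sample
assert torch.allclose(new_wav, wav, rtol=0, atol=1.0 / (1 << 15))
```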
47 changes: 47 additions & 0 deletions test/wenet/dataset/test_processor.py
@@ -1,4 +1,7 @@
import json
import pytest
from torchaudio._extension import torchaudio
from whisper import torch

from wenet.dataset import processor
from wenet.utils.init_tokenizer import init_tokenizer
@@ -154,3 +157,47 @@ def test_tokenize(symbol_table_path):
assert (all(h == r for h, r in zip(hyp["tokens"], ref["tokens"])))
assert (len(hyp["label"]) == len(ref["label"]))
assert (all(h == r for h, r in zip(hyp["label"], ref["label"])))


def _get_records(raw_file_path):
records = []
with open(raw_file_path, 'r') as f:
for line in f:
json_line = line.strip('\n')
records.append({'src': json_line})
return records


@pytest.mark.parametrize("raw_file_path", ["test/resources/dataset/data.list"])
def test_parse_raw(raw_file_path):

records = _get_records(raw_file_path)
raw_processor = processor.parse_raw(records)
for (ori, processed) in zip(records, raw_processor):
ori = json.loads(ori['src'])
assert ori['key'] == processed['key']
ori_waveform, ori_sample_rate = torchaudio.load(ori['wav'])
processed_waveform = processed['wav']
assert torch.allclose(ori_waveform, processed_waveform)
assert ori_sample_rate == processed['sample_rate']
assert processed['txt'] == ori['txt']


@pytest.mark.parametrize(
"shard_path", ["test/resources/dataset/shards/shards_000000000.tar"])
def test_tar_file_and_group(shard_path):
# TODO: parameterize
raw_file_path = 'test/resources/dataset/data.list'
records = _get_records(raw_file_path)

tar_iter = iter([{'stream': open(shard_path, 'rb')}])
tar_processor = processor.tar_file_and_group(tar_iter)
for (ori, processed) in zip(records, tar_processor):
print(processed)
ori = json.loads(ori['src'])
assert ori['key'] == processed['key']
ori_waveform, ori_sample_rate = torchaudio.load(ori['wav'])
processed_waveform = processed['wav']
assert torch.allclose(ori_waveform, processed_waveform)
assert ori_sample_rate == processed['sample_rate']
assert processed['txt'] == ori['txt']
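For orientation, `processor.parse_raw` consumes dicts whose `src` field holds one JSON line of `data.list` and yields samples carrying `key`, `wav` (a waveform tensor), `sample_rate`, and `txt`, as the assertions above rely on. A minimal sketch using the new fixture (run from the repository root):

```python
import json

from wenet.dataset import processor

records = [{'src': json.dumps({
    "key": "test/resources/dataset/aishell-BAC009S0724W0121",
    "wav": "test/resources/dataset/aishell-BAC009S0724W0121.wav",
    "txt": "广州市房地产中介协会分析",
}, ensure_ascii=False)}]

for sample in processor.parse_raw(records):
    print(sample['key'], sample['sample_rate'], sample['wav'].shape, sample['txt'])
```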
2 changes: 1 addition & 1 deletion test/wenet/whisper/test_whisper.py
@@ -475,7 +475,7 @@ def test_model(model, audio_path):
np.testing.assert_allclose(whisper_layer["value"].numpy(),
wenet_layer["value"].numpy(),
rtol=1e-7,
atol=6e-3)
atol=8e-3)
np.testing.assert_allclose(whisper_encoder_out.numpy(),
wenet_encoder_out.numpy(),
rtol=1e-7,
49 changes: 16 additions & 33 deletions tools/make_shard_list.py
@@ -24,7 +24,6 @@

import torch
import torchaudio
import torchaudio.backend.sox_io_backend as sox

AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])

@@ -52,51 +51,35 @@ def write_tar_file(data_list,
if no_segments:
# read & resample
ts = time.time()
audio, sample_rate = sox.load(wav, normalize=False)
if sample_rate != resample:
audio = torchaudio.transforms.Resample(
sample_rate, resample)(audio.float())
audio = audio.to(torch.int16)
audio, sample_rate = torchaudio.load(wav)
audio = torchaudio.transforms.Resample(sample_rate,
resample)(audio)
read_time += (time.time() - ts)
# change format to wav
ts = time.time()
f = io.BytesIO()
sox.save(f, audio, resample, format="wav", bits_per_sample=16)
suffix = "wav"
f.seek(0)
data = f.read()
save_time += (time.time() - ts)
else:
if wav != prev_wav:
ts = time.time()
waveforms, sample_rate = sox.load(wav, normalize=False)
waveforms, sample_rate = torchaudio.load(wav)
read_time += (time.time() - ts)
prev_wav = wav
start = int(start * sample_rate)
end = int(end * sample_rate)
audio = waveforms[:1, start:end]
audio = torchaudio.transforms.Resample(sample_rate,
resample)(audio)

# resample
if sample_rate != resample:
if not audio.is_floating_point():
# normalize the audio before resample
# because resample can't process int audio
audio = audio / (1 << 15)
audio = torchaudio.transforms.Resample(
sample_rate, resample)(audio)
audio = (audio * (1 << 15)).short()
else:
audio = torchaudio.transforms.Resample(
sample_rate, resample)(audio)

ts = time.time()
f = io.BytesIO()
sox.save(f, audio, resample, format="wav", bits_per_sample=16)
# Save to wav for segments file
audio = (audio * (1 << 15))
audio = audio.to(torch.int16)
ts = time.time()
with io.BytesIO() as f:
torchaudio.save(f,
audio,
resample,
format="wav",
bits_per_sample=16)
suffix = "wav"
f.seek(0)
data = f.read()
save_time += (time.time() - ts)
save_time += (time.time() - ts)

assert isinstance(txt, str)
ts = time.time()
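The main behavioural point behind this rewrite of `write_tar_file`: the removed `sox.load(wav, normalize=False)` returned raw `int16` samples, whereas `torchaudio.load(wav)` returns normalized `float32`, so resampling now happens in float and the conversion back to 16-bit PCM is explicit before saving. A condensed sketch of the new path (hypothetical input file, 16 kHz target):

```python
import io

import torch
import torchaudio

audio, sample_rate = torchaudio.load("utt.wav")  # float32 in [-1, 1]
audio = torchaudio.transforms.Resample(sample_rate, 16000)(audio)

with io.BytesIO() as f:
    # Scale back to 16-bit PCM; the old sox_io path kept int16 end to end
    torchaudio.save(f, (audio * (1 << 15)).to(torch.int16), 16000,
                    format="wav", bits_per_sample=16)
    f.seek(0)
    data = f.read()
```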
13 changes: 6 additions & 7 deletions wenet/dataset/processor.py
@@ -76,7 +76,7 @@ def tar_file_and_group(data):
"""
for sample in data:
assert 'stream' in sample
stream = tarfile.open(fileobj=sample['stream'], mode="r|*")
stream = tarfile.open(fileobj=sample['stream'], mode="r:*")
prev_prefix = None
example = {}
valid = True
@@ -136,14 +136,13 @@ def parse_raw(data):
try:
if 'start' in obj:
assert 'end' in obj
sample_rate = torchaudio.backend.sox_io_backend.info(
wav_file).sample_rate
sample_rate = torchaudio.info(wav_file).sample_rate
start_frame = int(obj['start'] * sample_rate)
end_frame = int(obj['end'] * sample_rate)
waveform, _ = torchaudio.backend.sox_io_backend.load(
filepath=wav_file,
num_frames=end_frame - start_frame,
frame_offset=start_frame)
waveform, _ = torchaudio.load(filepath=wav_file,
num_frames=end_frame -
start_frame,
frame_offset=start_frame)
else:
waveform, sample_rate = torchaudio.load(wav_file)
example = copy.deepcopy(obj) # copy and keep all the fields
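For reference, segment reads in `parse_raw` now go through the frame-based arguments of `torchaudio.load` rather than the removed `sox_io_backend`. A minimal sketch of cutting a window from 0.5 s to 2.0 s (hypothetical file and times):

```python
import torchaudio

wav_file = "utt.wav"  # hypothetical path
sample_rate = torchaudio.info(wav_file).sample_rate
start_frame = int(0.5 * sample_rate)
end_frame = int(2.0 * sample_rate)
waveform, _ = torchaudio.load(wav_file,
                              frame_offset=start_frame,
                              num_frames=end_frame - start_frame)
```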
2 changes: 0 additions & 2 deletions wenet/dataset/wav_distortion.py
@@ -19,8 +19,6 @@
import torchaudio
import torch

torchaudio.set_audio_backend("sox_io")


def db2amp(db):
return pow(10, db / 20)