bump version to 2.xx #2301

Merged (8 commits) on Jan 18, 2024
3 changes: 1 addition & 2 deletions .github/workflows/unit_test.yml
@@ -40,11 +40,10 @@ jobs:
run: |
set -eux
pip install -r requirements.txt
sudo apt update && sudo apt install -y ffmpeg
sudo apt update && sudo apt install -y ffmpeg libsox-dev
- name: Run Pytest
run: |
set -eux
pytest --version
PYTHONPATH="${PYTHONPATH:-}:$(pwd)" pytest -q
if [ $? != 0 ]; then exit 1; fi

14 changes: 12 additions & 2 deletions README.md
@@ -58,8 +58,18 @@ git clone https://github.com/wenet-e2e/wenet.git
``` sh
conda create -n wenet python=3.8
conda activate wenet
conda install conda-forge::sox
pip install -r requirements.txt
pre-commit install # for clean and tidy code

# If you encounter sox compatibility issues
RuntimeError: set_buffer_size requires sox extension which is not available.
# ubuntu
sudo apt-get install sox libsox-dev
# centos
sudo yum install sox sox-devel
# conda env
conda install conda-forge::sox
```

**Build for deployment**
@@ -100,8 +110,8 @@ Please scan the personal QR code on the right, and the guy is responsible for in

``` bibtex
@inproceedings{yao2021wenet,
title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
booktitle={Proc. Interspeech},
year={2021},
address={Brno, Czech Republic },
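Regarding the sox troubleshooting block added to the README above: a quick way to confirm that torchaudio can actually reach the sox extension is sketched below (not part of the PR; the `set_buffer_size(16500)` call mirrors the one used in the new unit test).

```python
import torchaudio

# List the I/O backends torchaudio can dispatch to, e.g. ['ffmpeg', 'sox', 'soundfile']
print(torchaudio.list_audio_backends())

# Raises "RuntimeError: set_buffer_size requires sox extension which is not available."
# if torchaudio was built or installed without sox support.
torchaudio.utils.sox_utils.set_buffer_size(16500)
```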
4 changes: 2 additions & 2 deletions requirements.txt
@@ -13,8 +13,8 @@ flake8-pyi==20.5.0
mccabe
pycodestyle==2.6.0
pyflakes==2.2.0
torch==1.13.0
torchaudio==0.13.0
torch==2.1.2
torchaudio==2.1.2
tqdm
deepspeed
librosa
1 change: 1 addition & 0 deletions test/resources/dataset/aishell-BAC009S0724W0121.wav
2 changes: 2 additions & 0 deletions test/resources/dataset/data.list
@@ -0,0 +1,2 @@
{"key": "test/resources/dataset/aishell-BAC009S0724W0121", "wav": "test/resources/dataset/aishell-BAC009S0724W0121.wav", "txt": "广州市房地产中介协会分析"}
{"key": "test/resources/dataset/librispeech-1995-1837-0001", "wav": "test/resources/dataset/librispeech-1995-1837-0001.wav", "txt": "IT WAS THE FIRST GREAT SORROW OF HIS LIFE IT WAS NOT SO MUCH THE LOSS OF THE COTTON ITSELF BUT THE FANTASY THE HOPES THE DREAMS BUILT AROUND IT"}
1 change: 1 addition & 0 deletions test/resources/dataset/data.shards.list
@@ -0,0 +1 @@
test/resources/dataset/shards/shards_000000000.tar
1 change: 1 addition & 0 deletions test/resources/dataset/librispeech-1995-1837-0001.wav
Binary file not shown.
2 changes: 2 additions & 0 deletions test/resources/dataset/text
@@ -0,0 +1,2 @@
test/resources/dataset/aishell-BAC009S0724W0121 广州市房地产中介协会分析
test/resources/dataset/librispeech-1995-1837-0001 IT WAS THE FIRST GREAT SORROW OF HIS LIFE IT WAS NOT SO MUCH THE LOSS OF THE COTTON ITSELF BUT THE FANTASY THE HOPES THE DREAMS BUILT AROUND IT
2 changes: 2 additions & 0 deletions test/resources/dataset/wav.scp
@@ -0,0 +1,2 @@
test/resources/dataset/aishell-BAC009S0724W0121 test/resources/dataset/aishell-BAC009S0724W0121.wav
test/resources/dataset/librispeech-1995-1837-0001 test/resources/dataset/librispeech-1995-1837-0001.wav
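These fixtures follow the usual Kaldi-style layout: `wav.scp` maps an utterance key to a wav path, and `text` maps the same key to its transcript, with key and value separated by the first whitespace. A minimal reader sketch (paths as above):

```python
def read_kv(path):
    # Each line is "<key> <value...>"; split on the first whitespace only
    table = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, value = line.strip().split(maxsplit=1)
            table[key] = value
    return table

wavs = read_kv("test/resources/dataset/wav.scp")
texts = read_kv("test/resources/dataset/text")
assert wavs.keys() == texts.keys()
```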
31 changes: 31 additions & 0 deletions test/tools/test_make_shard.py
@@ -0,0 +1,31 @@
import glob
import io
import torch
from torchaudio._extension import torchaudio


def test_save_load_consistently():
wav_paths = glob.glob("test/resources/*.wav")
for wav_path in wav_paths:
wav, sr = torchaudio.load(wav_path)
with io.BytesIO() as f:
wav = torchaudio.transforms.Resample(sr, sr)(wav)
wav_short = (wav * (1 << 15))
wav_short = wav_short.to(torch.int16)
torchaudio.save(f, wav_short, sr, format="wav", bits_per_sample=16)
f.seek(0)
b = f.read()

with io.BytesIO(b) as f:
new_wav, new_sr = torchaudio.load(f)
assert new_sr == sr
torch.allclose(new_wav, wav)


def test_sox_set_buffer():
torchaudio.utils.sox_utils.set_buffer_size(16500)


def test_make_shards():
# TODO(MDdct): add make shards
pass
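Note that the `torch.allclose(new_wav, wav)` call in `test_save_load_consistently` above is not asserted. A version that does assert, with an explicit tolerance for the int16 quantization, might look roughly like this (a sketch, not part of the PR):

```python
import io
import torch
import torchaudio

wav, sr = torchaudio.load("test/resources/dataset/aishell-BAC009S0724W0121.wav")

buf = io.BytesIO()
# torchaudio.load returns normalized float32, so scale to 16-bit PCM before saving
torchaudio.save(buf, (wav * (1 << 15)).to(torch.int16), sr,
                format="wav", bits_per_sample=16)
buf.seek(0)

new_wav, new_sr = torchaudio.load(buf)  # back to normalized float32
assert new_sr == sr
# int16 quantization introduces at most ~1/2**15 absolute error per sample
assert torch.allclose(new_wav, wav, rtol=0, atol=1.0 / (1 << 15))
```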
47 changes: 47 additions & 0 deletions test/wenet/dataset/test_processor.py
@@ -1,4 +1,7 @@
import json
import pytest
from torchaudio._extension import torchaudio
from whisper import torch

from wenet.dataset import processor
from wenet.utils.init_tokenizer import init_tokenizer
@@ -154,3 +157,47 @@ def test_tokenize(symbol_table_path):
assert (all(h == r for h, r in zip(hyp["tokens"], ref["tokens"])))
assert (len(hyp["label"]) == len(ref["label"]))
assert (all(h == r for h, r in zip(hyp["label"], ref["label"])))


def _get_records(raw_file_path):
records = []
with open(raw_file_path, 'r') as f:
for line in f:
json_line = line.strip('\n')
records.append({'src': json_line})
return records


@pytest.mark.parametrize("raw_file_path", ["test/resources/dataset/data.list"])
def test_parse_raw(raw_file_path):

records = _get_records(raw_file_path)
raw_processor = processor.parse_raw(records)
for (ori, processed) in zip(records, raw_processor):
ori = json.loads(ori['src'])
assert ori['key'] == processed['key']
ori_waveform, ori_sample_rate = torchaudio.load(ori['wav'])
processed_waveform = processed['wav']
assert torch.allclose(ori_waveform, processed_waveform)
assert ori_sample_rate == processed['sample_rate']
assert processed['txt'] == ori['txt']


@pytest.mark.parametrize(
"shard_path", ["test/resources/dataset/shards/shards_000000000.tar"])
def test_tar_file_and_group(shard_path):
# TODO: parameterize
raw_file_path = 'test/resources/dataset/data.list'
records = _get_records(raw_file_path)

tar_iter = iter([{'stream': open(shard_path, 'rb')}])
tar_processor = processor.tar_file_and_group(tar_iter)
for (ori, processed) in zip(records, tar_processor):
print(processed)
ori = json.loads(ori['src'])
assert ori['key'] == processed['key']
ori_waveform, ori_sample_rate = torchaudio.load(ori['wav'])
processed_waveform = processed['wav']
assert torch.allclose(ori_waveform, processed_waveform)
assert ori_sample_rate == processed['sample_rate']
assert processed['txt'] == ori['txt']
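For orientation, `processor.parse_raw` consumes dicts whose `src` field holds one JSON line of `data.list` and yields samples carrying `key`, `wav` (a waveform tensor), `sample_rate`, and `txt`, as the assertions above rely on. A minimal sketch using the new fixture (run from the repository root):

```python
import json

from wenet.dataset import processor

records = [{'src': json.dumps({
    "key": "test/resources/dataset/aishell-BAC009S0724W0121",
    "wav": "test/resources/dataset/aishell-BAC009S0724W0121.wav",
    "txt": "广州市房地产中介协会分析",
}, ensure_ascii=False)}]

for sample in processor.parse_raw(records):
    print(sample['key'], sample['sample_rate'], sample['wav'].shape, sample['txt'])
```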
2 changes: 1 addition & 1 deletion test/wenet/whisper/test_whisper.py
@@ -475,7 +475,7 @@ def test_model(model, audio_path):
np.testing.assert_allclose(whisper_layer["value"].numpy(),
wenet_layer["value"].numpy(),
rtol=1e-7,
atol=6e-3)
atol=8e-3)
np.testing.assert_allclose(whisper_encoder_out.numpy(),
wenet_encoder_out.numpy(),
rtol=1e-7,
49 changes: 16 additions & 33 deletions tools/make_shard_list.py
@@ -24,7 +24,6 @@

import torch
import torchaudio
import torchaudio.backend.sox_io_backend as sox

AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])

@@ -52,51 +51,35 @@ def write_tar_file(data_list,
if no_segments:
# read & resample
ts = time.time()
audio, sample_rate = sox.load(wav, normalize=False)
if sample_rate != resample:
audio = torchaudio.transforms.Resample(
sample_rate, resample)(audio.float())
audio = audio.to(torch.int16)
audio, sample_rate = torchaudio.load(wav)
audio = torchaudio.transforms.Resample(sample_rate,
resample)(audio)
read_time += (time.time() - ts)
# change format to wav
ts = time.time()
f = io.BytesIO()
sox.save(f, audio, resample, format="wav", bits_per_sample=16)
suffix = "wav"
f.seek(0)
data = f.read()
save_time += (time.time() - ts)
else:
if wav != prev_wav:
ts = time.time()
waveforms, sample_rate = sox.load(wav, normalize=False)
waveforms, sample_rate = torchaudio.load(wav)
read_time += (time.time() - ts)
prev_wav = wav
start = int(start * sample_rate)
end = int(end * sample_rate)
audio = waveforms[:1, start:end]
audio = torchaudio.transforms.Resample(sample_rate,
resample)(audio)

# resample
if sample_rate != resample:
if not audio.is_floating_point():
# normalize the audio before resample
# because resample can't process int audio
audio = audio / (1 << 15)
audio = torchaudio.transforms.Resample(
sample_rate, resample)(audio)
audio = (audio * (1 << 15)).short()
else:
audio = torchaudio.transforms.Resample(
sample_rate, resample)(audio)

ts = time.time()
f = io.BytesIO()
sox.save(f, audio, resample, format="wav", bits_per_sample=16)
# Save to wav for segments file
audio = (audio * (1 << 15))
audio = audio.to(torch.int16)
ts = time.time()
with io.BytesIO() as f:
torchaudio.save(f,
audio,
resample,
format="wav",
bits_per_sample=16)
suffix = "wav"
f.seek(0)
data = f.read()
save_time += (time.time() - ts)
save_time += (time.time() - ts)

assert isinstance(txt, str)
ts = time.time()
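The main behavioural point behind this rewrite of `write_tar_file`: the removed `sox.load(wav, normalize=False)` returned raw `int16` samples, whereas `torchaudio.load(wav)` returns normalized `float32`, so resampling now happens in float and the conversion back to 16-bit PCM is explicit before saving. A condensed sketch of the new path (hypothetical input file, 16 kHz target):

```python
import io

import torch
import torchaudio

audio, sample_rate = torchaudio.load("utt.wav")  # float32 in [-1, 1]
audio = torchaudio.transforms.Resample(sample_rate, 16000)(audio)

with io.BytesIO() as f:
    # Scale back to 16-bit PCM; the old sox_io path kept int16 end to end
    torchaudio.save(f, (audio * (1 << 15)).to(torch.int16), 16000,
                    format="wav", bits_per_sample=16)
    f.seek(0)
    data = f.read()
```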
13 changes: 6 additions & 7 deletions wenet/dataset/processor.py
@@ -76,7 +76,7 @@ def tar_file_and_group(data):
"""
for sample in data:
assert 'stream' in sample
stream = tarfile.open(fileobj=sample['stream'], mode="r|*")
stream = tarfile.open(fileobj=sample['stream'], mode="r:*")
prev_prefix = None
example = {}
valid = True
@@ -136,14 +136,13 @@ def parse_raw(data):
try:
if 'start' in obj:
assert 'end' in obj
sample_rate = torchaudio.backend.sox_io_backend.info(
wav_file).sample_rate
sample_rate = torchaudio.info(wav_file).sample_rate
start_frame = int(obj['start'] * sample_rate)
end_frame = int(obj['end'] * sample_rate)
waveform, _ = torchaudio.backend.sox_io_backend.load(
filepath=wav_file,
num_frames=end_frame - start_frame,
frame_offset=start_frame)
waveform, _ = torchaudio.load(filepath=wav_file,
num_frames=end_frame -
start_frame,
frame_offset=start_frame)
else:
waveform, sample_rate = torchaudio.load(wav_file)
example = copy.deepcopy(obj) # copy and keep all the fields
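For reference, segment reads in `parse_raw` now go through the frame-based arguments of `torchaudio.load` rather than the removed `sox_io_backend`. A minimal sketch of cutting a window from 0.5 s to 2.0 s (hypothetical file and times):

```python
import torchaudio

wav_file = "utt.wav"  # hypothetical path
sample_rate = torchaudio.info(wav_file).sample_rate
start_frame = int(0.5 * sample_rate)
end_frame = int(2.0 * sample_rate)
waveform, _ = torchaudio.load(wav_file,
                              frame_offset=start_frame,
                              num_frames=end_frame - start_frame)
```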
2 changes: 0 additions & 2 deletions wenet/dataset/wav_distortion.py
@@ -19,8 +19,6 @@
import torchaudio
import torch

torchaudio.set_audio_backend("sox_io")


def db2amp(db):
return pow(10, db / 20)