diff --git a/wenet/dataset/processor.py b/wenet/dataset/processor.py index 68e500e45..a769eba8e 100644 --- a/wenet/dataset/processor.py +++ b/wenet/dataset/processor.py @@ -27,7 +27,7 @@ from torch.nn.utils.rnn import pad_sequence from wenet.text.base_tokenizer import BaseTokenizer -# torchaudio.utils.sox_utils.set_buffer_size(16500) +torchaudio.utils.sox_utils.set_buffer_size(16500) AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) diff --git a/wenet/utils/common.py b/wenet/utils/common.py index 7384bad57..5da1fb341 100644 --- a/wenet/utils/common.py +++ b/wenet/utils/common.py @@ -20,6 +20,9 @@ import torch from torch.nn.utils.rnn import pad_sequence +from whisper.tokenizer import LANGUAGES as WhiserLanguages + +WHISPER_LANGS = tuple(WhiserLanguages.keys()) IGNORE_ID = -1 @@ -173,8 +176,6 @@ def add_whisper_tokens( ys_out (torch.Tensor) : (B, Lmax + ?) """ - from whisper.tokenizer import LANGUAGES as WhiserLanguages - WHISPER_LANGS = tuple(WhiserLanguages.keys()) if use_prev: # i.e., hotword list _prev = [special_tokens["sot_prev"]]