Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[vec]voxceleb convert dataset format to paddlespeech #1630

Merged
merged 13 commits into from
Apr 11, 2022
Merged
8 changes: 6 additions & 2 deletions dataset/rir_noise/rir_noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL_ROOT = 'http://www.openslr.org/resources/28'
URL_ROOT = '--no-check-certificate http://www.openslr.org/resources/28'
DATA_URL = URL_ROOT + '/rirs_noises.zip'
MD5_DATA = 'e6f48e257286e05de56413b4779d8ffb'

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/Aishell",
default=DATA_HOME + "/rirs_noise",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
Expand Down Expand Up @@ -81,6 +81,10 @@ def create_manifest(data_dir, manifest_path_prefix):
},
ensure_ascii=False))
manifest_path = manifest_path_prefix + '.' + dtype

if not os.path.exists(os.path.dirname(manifest_path)):
os.makedirs(os.path.dirname(manifest_path))

with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
Expand Down
2 changes: 1 addition & 1 deletion dataset/voxceleb/voxceleb1.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path,
# we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory
if not os.path.exists(os.path.join(target_dir, "wav")):
# download all dataset part
print("start to download the vox1 dev zip package")
print(f"start to download the vox1 zip package to {target_dir}")
for zip_part in data_list.keys():
download_url = " --no-check-certificate " + base_url + "/" + zip_part
download(
Expand Down
81 changes: 70 additions & 11 deletions dataset/voxceleb/voxceleb2.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,12 @@
import glob
import json
import os
import subprocess
from pathlib import Path

import soundfile

from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip

Expand All @@ -35,12 +37,22 @@
BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"

# dev data
DEV_DATA_URL = BASE_URL + '/vox2_aac.zip'
DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402"
DEV_LIST = {
"vox2_dev_aac_partaa": "da070494c573e5c0564b1d11c3b20577",
"vox2_dev_aac_partab": "17fe6dab2b32b48abaf1676429cdd06f",
"vox2_dev_aac_partac": "1de58e086c5edf63625af1cb6d831528",
"vox2_dev_aac_partad": "5a043eb03e15c5a918ee6a52aad477f9",
"vox2_dev_aac_partae": "cea401b624983e2d0b2a87fb5d59aa60",
"vox2_dev_aac_partaf": "fc886d9ba90ab88e7880ee98effd6ae9",
"vox2_dev_aac_partag": "d160ecc3f6ee3eed54d55349531cb42e",
"vox2_dev_aac_partah": "6b84a81b9af72a9d9eecbb3b1f602e65",
}

DEV_TARGET_DATA = "vox2_dev_aac_parta* vox2_dev_aac.zip bbc063c46078a602ca71605645c2a402"

# test data
TEST_DATA_URL = BASE_URL + '/vox2_test_aac.zip'
TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312"
TEST_LIST = {"vox2_test_aac.zip": "0d2b3ea430a821c33263b5ea37ede312"}
TEST_TARGET_DATA = "vox2_test_aac.zip vox2_test_aac.zip 0d2b3ea430a821c33263b5ea37ede312"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
Expand Down Expand Up @@ -68,6 +80,14 @@


def create_manifest(data_dir, manifest_path_prefix):
"""Generate the voxceleb2 dataset manifest file.
We will create the ${manifest_path_prefix}.vox2 as the final manifest file
The dev and test wav info will be put in one manifest file.

Args:
data_dir (str): voxceleb2 wav directory, which includes the dev and test subdatasets
manifest_path_prefix (str): manifest file prefix
"""
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
data_path = os.path.join(data_dir, "**", "*.wav")
Expand Down Expand Up @@ -119,7 +139,19 @@ def create_manifest(data_dir, manifest_path_prefix):
print(f"{total_sec / total_num} sec/utt", file=f)


def download_dataset(url, md5sum, target_dir, dataset):
def download_dataset(base_url, data_list, target_data, target_dir, dataset):
"""Download the voxceleb2 zip package

Args:
base_url (str): the base url for downloading the voxceleb2 dataset
data_list (dict): the dataset part zip package and the md5 value
target_data (str): the final dataset zip info
target_dir (str): the dataset stored directory
dataset (str): the dataset name, dev or test

Raises:
RuntimeError: the md5sum check of the target zip file fails
"""
if not os.path.exists(target_dir):
os.makedirs(target_dir)

Expand All @@ -129,9 +161,34 @@ def download_dataset(url, md5sum, target_dir, dataset):
# but the test dataset will unzip to aac
# so, we create the ${target_dir}/test and unzip the m4a to test dir
if not os.path.exists(os.path.join(target_dir, dataset)):
filepath = download(url, md5sum, target_dir)
print(f"start to download the vox2 zip package to {target_dir}")
for zip_part in data_list.keys():
download_url = " --no-check-certificate " + base_url + "/" + zip_part
download(
url=download_url,
md5sum=data_list[zip_part],
target_dir=target_dir)

# pack all the parts into the target zip file
all_target_part, target_name, target_md5sum = target_data.split()
target_name = os.path.join(target_dir, target_name)
if not os.path.exists(target_name):
pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part,
target_name)
subprocess.call(pack_part_cmd, shell=True)

# check the target zip file md5sum
if not check_md5sum(target_name, target_md5sum):
raise RuntimeError("{} MD5 checkssum failed".format(target_name))
else:
print("Check {} md5sum successfully".format(target_name))

if dataset == "test":
unzip(filepath, os.path.join(target_dir, "test"))
# we need to make the test directory
unzip(target_name, os.path.join(target_dir, "test"))
else:
# unzip the dev zip package, which will create the dev directory
unzip(target_name, target_dir)


def main():
Expand All @@ -142,14 +199,16 @@ def main():
print("download: {}".format(args.download))
if args.download:
download_dataset(
url=DEV_DATA_URL,
md5sum=DEV_MD5SUM,
base_url=BASE_URL,
data_list=DEV_LIST,
target_data=DEV_TARGET_DATA,
target_dir=args.target_dir,
dataset="dev")

download_dataset(
url=TEST_DATA_URL,
md5sum=TEST_MD5SUM,
base_url=BASE_URL,
data_list=TEST_LIST,
target_data=TEST_TARGET_DATA,
target_dir=args.target_dir,
dataset="test")

Expand Down
15 changes: 8 additions & 7 deletions examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
###########################################
# Data #
###########################################
# we should explicitly specify the wav path of vox2 audio data converted from m4a
vox2_base_path:
augment: True
batch_size: 16
batch_size: 32
num_workers: 2
num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
shuffle: True
skip_prep: False
split_ratio: 0.9
chunk_duration: 3.0 # seconds
random_chunk: True
verification_file: data/vox1/veri_test2.txt

###########################################################
# FEATURE EXTRACTION SETTING #
Expand All @@ -26,7 +28,6 @@ hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
# if we want to use another model, please choose another configuration yaml file
model:
input_size: 80
# "channels": [512, 512, 512, 512, 1536],
channels: [1024, 1024, 1024, 1024, 3072]
kernel_sizes: [5, 3, 3, 3, 1]
dilations: [1, 2, 3, 4, 1]
Expand All @@ -38,8 +39,8 @@ model:
###########################################
seed: 1986 # according to the speechbrain configuration
epochs: 10
save_interval: 1
log_interval: 1
save_interval: 10
log_interval: 10
learning_rate: 1e-8


Expand Down
53 changes: 53 additions & 0 deletions examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
###########################################
# Data #
###########################################
augment: True
batch_size: 16
num_workers: 2
num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
shuffle: True
skip_prep: False
split_ratio: 0.9
chunk_duration: 3.0 # seconds
random_chunk: True
verification_file: data/vox1/veri_test2.txt

###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
# currently, we only support fbank
sr: 16000 # sample rate
n_mels: 80
window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160

###########################################################
# MODEL SETTING #
###########################################################
# currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml
# if we want to use another model, please choose another configuration yaml file
model:
input_size: 80
channels: [512, 512, 512, 512, 1536]
kernel_sizes: [5, 3, 3, 3, 1]
dilations: [1, 2, 3, 4, 1]
attention_channels: 128
lin_neurons: 192

###########################################
# Training #
###########################################
seed: 1986 # according to the speechbrain configuration
epochs: 100
save_interval: 10
log_interval: 10
learning_rate: 1e-8


###########################################
# Testing #
###########################################
global_embedding_norm: True
embedding_mean_norm: True
embedding_std_norm: False

Loading