Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add speaker_tasks datasets folder, add diarization datasets voxconverse/aishell #5042

Merged
merged 5 commits into from
Oct 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/asr/speaker_diarization/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ We provide a helper script to download the dataset and format it into a NeMo man
.. code-block:: bash
python scripts/data_processing/get_ami_data.py --manifest_filepath AMItest_input_manifest.json
python scripts/data_processing/speaker_tasks/get_ami_data.py --manifest_filepath AMItest_input_manifest.json
CallHome American English Speech (CHAES), LDC97S42
Expand Down
5 changes: 5 additions & 0 deletions scripts/dataset_processing/speaker_tasks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Speaker Tasks Dataset Scripts

This folder contains scripts that download speaker-task datasets (mainly for diarization). Each script produces NeMo-format manifest files for use with diarization.

We also have scripts for CallHome and DIHARD3; however, that data must be downloaded separately. If you need those scripts, please open an issue.
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Downloads the training/eval sets for AISHELL diarization.
# The training dataset is around 170 GiB; to skip it, pass the --skip_train flag.

import argparse
import glob
import logging
import os
import tarfile
from pathlib import Path

import wget
from sox import Transformer

from nemo.collections.asr.parts.utils.manifest_utils import create_manifest

# OpenSLR resource 111: training archives come in three sizes; the {}
# placeholder is filled with one of the S/M/L tags below.
train_url = "https://www.openslr.org/resources/111/train_{}.tar.gz"
train_datasets = ["S", "M", "L"]

# Single evaluation archive for the same corpus.
eval_url = "https://www.openslr.org/resources/111/test.tar.gz"


def extract_file(filepath: str, data_dir: str):
    """Extract a .tar(.gz) archive into ``data_dir``.

    Any extraction error is logged and swallowed, on the assumption that
    the archive was already extracted by a previous run (best-effort
    behavior preserved from the original).

    Args:
        filepath: path to the tar archive.
        data_dir: directory to extract into.
    """
    try:
        # Use a context manager so the tar handle is closed even when
        # extractall() raises; the previous version leaked it on error.
        with tarfile.open(filepath) as tar:
            tar.extractall(data_dir)
    except Exception:
        logging.info("Not extracting. Maybe already there?")


def __process_data(dataset_url: str, dataset_path: Path, manifest_output_path: Path):
    """Download one AISHELL diarization archive, convert its FLAC audio to
    WAV, and write a NeMo manifest covering the audio/RTTM pairs.

    Args:
        dataset_url: URL of the .tar.gz archive to fetch.
        dataset_path: directory the archive is downloaded and extracted into.
        manifest_output_path: destination path of the NeMo manifest.
    """
    os.makedirs(dataset_path, exist_ok=True)

    archive_path = os.path.join(dataset_path, os.path.basename(dataset_url))
    if not os.path.exists(archive_path):
        wget.download(dataset_url, archive_path)
    extract_file(archive_path, str(dataset_path))

    # The tarball extracts into a directory named after the archive stem.
    extracted_name = Path(archive_path).stem.replace('.tar', '')
    converted_wav_dir = dataset_path / 'converted_wav/'
    __process_flac_audio(dataset_path / (extracted_name + '/wav/'), converted_wav_dir)

    wav_root = os.path.abspath(converted_wav_dir)
    audio_paths = [os.path.join(wav_root, name) for name in os.listdir(str(converted_wav_dir))]
    rttm_paths = [
        os.path.abspath(match) for match in glob.glob(str(dataset_path / (extracted_name + '/TextGrid/*.rttm')))
    ]

    # create_manifest() consumes newline-separated list files, so persist the
    # collected paths to disk first.
    audio_list_file = dataset_path / 'audio_files.txt'
    rttm_list_file = dataset_path / 'rttm_files.txt'
    audio_list_file.write_text('\n'.join(audio_paths))
    rttm_list_file.write_text('\n'.join(rttm_paths))

    create_manifest(
        str(audio_list_file), manifest_output_path, rttm_path=str(rttm_list_file),
    )


def __process_flac_audio(flac_path, wav_path):
    """Convert every FLAC file in ``flac_path`` to a WAV file in ``wav_path``.

    Conversion is skipped for files whose WAV output already exists, so the
    function is safe to re-run after a partial conversion.

    Args:
        flac_path: directory containing the source .flac files.
        wav_path: output directory for the converted .wav files.
    """
    os.makedirs(wav_path, exist_ok=True)
    for flac_file in os.listdir(flac_path):
        # Renamed from `id`, which shadowed the builtin of the same name.
        file_id = Path(flac_file).stem
        wav_file = os.path.join(wav_path, file_id + ".wav")
        if not os.path.exists(wav_file):
            # sox transcodes FLAC -> WAV in a single pass.
            Transformer().build(os.path.join(flac_path, flac_file), wav_file)


def main():
    """CLI entry point: fetch the AISHELL diarization splits and build manifests."""
    parser = argparse.ArgumentParser(description="Aishell Data download")
    parser.add_argument("--data_root", default='./', type=str)
    # NOTE(review): --output_manifest_path is parsed but never read below;
    # kept to preserve the existing CLI surface.
    parser.add_argument("--output_manifest_path", default='aishell_diar_manifest.json', type=str)
    parser.add_argument("--skip_train", help="skip downloading the training dataset", action="store_true")
    args = parser.parse_args()

    root = Path(args.data_root)
    root.mkdir(exist_ok=True, parents=True)

    if not args.skip_train:
        # Training data ships in three archive sizes; process each one.
        for size_tag in train_datasets:
            __process_data(
                dataset_url=train_url.format(size_tag),
                dataset_path=root / f'{size_tag}/',
                manifest_output_path=root / f'train_{size_tag}_manifest.json',
            )

    # The evaluation split is always downloaded and processed.
    __process_data(
        dataset_url=eval_url, dataset_path=root / 'eval/', manifest_output_path=root / 'eval_manifest.json'
    )


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# USAGE: python get_aishell_data.py --data_root=<where to put data>
# USAGE: python get_hi-mia_data.py --data_root=<where to put data>

import argparse
import json
Expand Down
89 changes: 89 additions & 0 deletions scripts/dataset_processing/speaker_tasks/get_voxconverse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# downloads the training/eval set for VoxConverse.

import argparse
import logging
import os
import zipfile
from pathlib import Path

import wget

from nemo.collections.asr.parts.utils.manifest_utils import create_manifest

# Official VoxConverse audio archives (dev and test splits).
dev_url = "https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_dev_wav.zip"
test_url = "https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_test_wav.zip"
# RTTM annotations are published in the VoxConverse GitHub repository.
rttm_annotations_url = "https://github.com/joonson/voxconverse/archive/refs/heads/master.zip"


def extract_file(filepath: Path, data_dir: Path):
    """Unpack a zip archive into ``data_dir``.

    Any extraction error is logged and ignored, on the assumption that the
    archive was already extracted by a previous run.

    Args:
        filepath: path to the .zip archive.
        data_dir: directory to extract into.
    """
    try:
        archive = zipfile.ZipFile(str(filepath), 'r')
        with archive:
            archive.extractall(str(data_dir))
    except Exception:
        logging.info("Not extracting. Maybe already there?")


def _generate_manifest(data_root: Path, audio_path: Path, rttm_path: Path, manifest_output_path: Path):
    """Write newline-separated list files of audio/RTTM paths and build a
    NeMo manifest from them.

    Args:
        data_root: directory where the intermediate list files are written.
        audio_path: directory containing the .wav audio files.
        rttm_path: directory containing the matching .rttm annotations.
        manifest_output_path: destination path of the generated manifest.
    """
    audio_list = str(data_root / 'audio_file.txt')
    rttm_list = str(data_root / 'rttm_file.txt')
    with open(audio_list, 'w') as f:
        # BUG FIX: audio entries were previously joined against rttm_path,
        # which put nonexistent audio paths into the manifest.
        f.write('\n'.join([str(os.path.join(audio_path, x)) for x in os.listdir(audio_path)]))
    with open(rttm_list, 'w') as f:
        f.write('\n'.join([str(os.path.join(rttm_path, x)) for x in os.listdir(rttm_path)]))
    create_manifest(
        audio_list, str(manifest_output_path), rttm_path=rttm_list,
    )


def main():
    """CLI entry point: download VoxConverse audio and RTTM annotations,
    extract them, and build dev/test manifests."""
    parser = argparse.ArgumentParser(description="VoxConverse Data download")
    parser.add_argument("--data_root", default='./', type=str)
    args = parser.parse_args()

    root = Path(args.data_root)
    root.mkdir(exist_ok=True, parents=True)

    # Local archive locations mirror the remote file names.
    test_archive = root / os.path.basename(test_url)
    dev_archive = root / os.path.basename(dev_url)
    rttm_archive = root / os.path.basename(rttm_annotations_url)

    # wget.download returns the path it wrote, which replaces the expected
    # archive path when a fresh download happens.
    if not os.path.exists(test_archive):
        test_archive = wget.download(test_url, str(root))
    if not os.path.exists(dev_archive):
        dev_archive = wget.download(dev_url, str(root))
    if not os.path.exists(rttm_archive):
        rttm_archive = wget.download(rttm_annotations_url, str(root))

    extract_file(test_archive, root / 'test/')
    extract_file(dev_archive, root / 'dev/')
    extract_file(rttm_archive, root)

    _generate_manifest(
        data_root=root,
        audio_path=os.path.abspath(root / 'test/voxconverse_test_wav/'),
        rttm_path=os.path.abspath(root / 'voxconverse-master/test/'),
        manifest_output_path=root / 'test_manifest.json',
    )
    _generate_manifest(
        data_root=root,
        audio_path=os.path.abspath(root / 'dev/audio/'),
        rttm_path=os.path.abspath(root / 'voxconverse-master/dev/'),
        manifest_output_path=root / 'dev_manifest.json',
    )


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
"source": [
"In this tutorial, we shall first train these embeddings on speaker-related datasets, and then get speaker embeddings from a pretrained network for a new dataset. Since Google Colab has very slow read-write speeds, I'll be demonstrating this tutorial using [an4](http://www.speech.cs.cmu.edu/databases/an4/). \n",
"\n",
"Instead, if you'd like to try on a bigger dataset like [hi-mia](https://arxiv.org/abs/1912.01231) use the [get_hi-mia-data.py](https://github.com/NVIDIA/NeMo/blob/stable/scripts/dataset_processing/get_hi-mia_data.py) script to download the necessary files, extract them, and resample to 16Khz if any of these samples are not at 16Khz. "
"Instead, if you'd like to try on a bigger dataset like [hi-mia](https://arxiv.org/abs/1912.01231) use the [get_hi-mia-data.py](https://github.com/NVIDIA/NeMo/tree/main/scripts/dataset_processing/speaker_tasks/get_hi-mia_data.py) script to download the necessary files, extract them, and resample to 16Khz if any of these samples are not at 16Khz. "
]
},
{
Expand Down