Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add speaker_tasks datasets folder, add diarization datasets voxconverse/aishell #5042

Merged
merged 5 commits into from
Oct 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/asr/speaker_diarization/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ We provide a helper script to download the dataset and format it into a NeMo man
.. code-block:: bash
python scripts/data_processing/get_ami_data.py --manifest_filepath AMItest_input_manifest.json
python scripts/data_processing/speaker_tasks/get_ami_data.py --manifest_filepath AMItest_input_manifest.json
CallHome American English Speech (CHAES), LDC97S42
Expand Down
5 changes: 5 additions & 0 deletions scripts/dataset_processing/speaker_tasks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Speaker Tasks Dataset Scripts

This folder contains scripts that download speaker-task datasets (mainly for diarization). Each script produces NeMo-format manifest files for use with diarization.

We also have scripts for CallHome and DIHARD3; however, that data must be downloaded separately. If you need those scripts, please open an issue.
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Downloads the training/eval sets for AISHELL diarization.
# The training dataset is around 170 GiB; to skip it, pass the --skip_train flag.

import argparse
import glob
import logging
import os
import tarfile
from pathlib import Path

import wget
from sox import Transformer

from nemo.collections.asr.parts.utils.manifest_utils import create_manifest

# OpenSLR resource 111: training archives come in three sizes; the {}
# placeholder is filled with one of the S/M/L tags below.
train_url = "https://www.openslr.org/resources/111/train_{}.tar.gz"
train_datasets = ["S", "M", "L"]

# Single evaluation archive for the same corpus.
eval_url = "https://www.openslr.org/resources/111/test.tar.gz"


def extract_file(filepath: str, data_dir: str):
    """Extract a .tar(.gz) archive into ``data_dir``.

    Any extraction error is logged and swallowed, on the assumption that
    the archive was already extracted by a previous run (best-effort
    behavior preserved from the original).

    Args:
        filepath: path to the tar archive.
        data_dir: directory to extract into.
    """
    try:
        # Use a context manager so the tar handle is closed even when
        # extractall() raises; the previous version leaked it on error.
        with tarfile.open(filepath) as tar:
            tar.extractall(data_dir)
    except Exception:
        logging.info("Not extracting. Maybe already there?")


def __process_data(dataset_url: str, dataset_path: Path, manifest_output_path: Path):
    """Download one AISHELL diarization archive, convert its FLAC audio to
    WAV, and write a NeMo manifest covering the audio/RTTM pairs.

    Args:
        dataset_url: URL of the .tar.gz archive to fetch.
        dataset_path: directory the archive is downloaded and extracted into.
        manifest_output_path: destination path of the NeMo manifest.
    """
    os.makedirs(dataset_path, exist_ok=True)

    archive_path = os.path.join(dataset_path, os.path.basename(dataset_url))
    if not os.path.exists(archive_path):
        wget.download(dataset_url, archive_path)
    extract_file(archive_path, str(dataset_path))

    # The tarball extracts into a directory named after the archive stem.
    extracted_name = Path(archive_path).stem.replace('.tar', '')
    converted_wav_dir = dataset_path / 'converted_wav/'
    __process_flac_audio(dataset_path / (extracted_name + '/wav/'), converted_wav_dir)

    wav_root = os.path.abspath(converted_wav_dir)
    audio_paths = [os.path.join(wav_root, name) for name in os.listdir(str(converted_wav_dir))]
    rttm_paths = [
        os.path.abspath(match) for match in glob.glob(str(dataset_path / (extracted_name + '/TextGrid/*.rttm')))
    ]

    # create_manifest() consumes newline-separated list files, so persist the
    # collected paths to disk first.
    audio_list_file = dataset_path / 'audio_files.txt'
    rttm_list_file = dataset_path / 'rttm_files.txt'
    audio_list_file.write_text('\n'.join(audio_paths))
    rttm_list_file.write_text('\n'.join(rttm_paths))

    create_manifest(
        str(audio_list_file), manifest_output_path, rttm_path=str(rttm_list_file),
    )


def __process_flac_audio(flac_path, wav_path):
    """Convert every FLAC file in ``flac_path`` to a WAV file in ``wav_path``.

    Conversion is skipped for files whose WAV output already exists, so the
    function is safe to re-run after a partial conversion.

    Args:
        flac_path: directory containing the source .flac files.
        wav_path: output directory for the converted .wav files.
    """
    os.makedirs(wav_path, exist_ok=True)
    for flac_file in os.listdir(flac_path):
        # Renamed from `id`, which shadowed the builtin of the same name.
        file_id = Path(flac_file).stem
        wav_file = os.path.join(wav_path, file_id + ".wav")
        if not os.path.exists(wav_file):
            # sox transcodes FLAC -> WAV in a single pass.
            Transformer().build(os.path.join(flac_path, flac_file), wav_file)


def main():
    """CLI entry point: fetch the AISHELL diarization splits and build manifests."""
    parser = argparse.ArgumentParser(description="Aishell Data download")
    parser.add_argument("--data_root", default='./', type=str)
    # NOTE(review): --output_manifest_path is parsed but never read below;
    # kept to preserve the existing CLI surface.
    parser.add_argument("--output_manifest_path", default='aishell_diar_manifest.json', type=str)
    parser.add_argument("--skip_train", help="skip downloading the training dataset", action="store_true")
    args = parser.parse_args()

    root = Path(args.data_root)
    root.mkdir(exist_ok=True, parents=True)

    if not args.skip_train:
        # Training data ships in three archive sizes; process each one.
        for size_tag in train_datasets:
            __process_data(
                dataset_url=train_url.format(size_tag),
                dataset_path=root / f'{size_tag}/',
                manifest_output_path=root / f'train_{size_tag}_manifest.json',
            )

    # The evaluation split is always downloaded and processed.
    __process_data(
        dataset_url=eval_url, dataset_path=root / 'eval/', manifest_output_path=root / 'eval_manifest.json'
    )


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# USAGE: python get_aishell_data.py --data_root=<where to put data>
# USAGE: python get_hi-mia_data.py --data_root=<where to put data>

import argparse
import json
Expand Down
89 changes: 89 additions & 0 deletions scripts/dataset_processing/speaker_tasks/get_voxconverse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# downloads the training/eval set for VoxConverse.

import argparse
import logging
import os
import zipfile
from pathlib import Path

import wget

from nemo.collections.asr.parts.utils.manifest_utils import create_manifest

# Official VoxConverse audio archives (dev and test splits).
dev_url = "https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_dev_wav.zip"
test_url = "https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_test_wav.zip"
# RTTM annotations are published in the VoxConverse GitHub repository.
rttm_annotations_url = "https://github.com/joonson/voxconverse/archive/refs/heads/master.zip"


def extract_file(filepath: Path, data_dir: Path):
    """Unpack a zip archive into ``data_dir``.

    Any extraction error is logged and ignored, on the assumption that the
    archive was already extracted by a previous run.

    Args:
        filepath: path to the .zip archive.
        data_dir: directory to extract into.
    """
    try:
        archive = zipfile.ZipFile(str(filepath), 'r')
        with archive:
            archive.extractall(str(data_dir))
    except Exception:
        logging.info("Not extracting. Maybe already there?")


def _generate_manifest(data_root: Path, audio_path: Path, rttm_path: Path, manifest_output_path: Path):
    """Write newline-separated list files of audio/RTTM paths and build a
    NeMo manifest from them.

    Args:
        data_root: directory where the intermediate list files are written.
        audio_path: directory containing the .wav audio files.
        rttm_path: directory containing the matching .rttm annotations.
        manifest_output_path: destination path of the generated manifest.
    """
    audio_list = str(data_root / 'audio_file.txt')
    rttm_list = str(data_root / 'rttm_file.txt')
    with open(audio_list, 'w') as f:
        # BUG FIX: audio entries were previously joined against rttm_path,
        # which put nonexistent audio paths into the manifest.
        f.write('\n'.join([str(os.path.join(audio_path, x)) for x in os.listdir(audio_path)]))
    with open(rttm_list, 'w') as f:
        f.write('\n'.join([str(os.path.join(rttm_path, x)) for x in os.listdir(rttm_path)]))
    create_manifest(
        audio_list, str(manifest_output_path), rttm_path=rttm_list,
    )


def main():
    """CLI entry point: download VoxConverse audio and RTTM annotations,
    extract them, and build dev/test manifests."""
    parser = argparse.ArgumentParser(description="VoxConverse Data download")
    parser.add_argument("--data_root", default='./', type=str)
    args = parser.parse_args()

    root = Path(args.data_root)
    root.mkdir(exist_ok=True, parents=True)

    # Local archive locations mirror the remote file names.
    test_archive = root / os.path.basename(test_url)
    dev_archive = root / os.path.basename(dev_url)
    rttm_archive = root / os.path.basename(rttm_annotations_url)

    # wget.download returns the path it wrote, which replaces the expected
    # archive path when a fresh download happens.
    if not os.path.exists(test_archive):
        test_archive = wget.download(test_url, str(root))
    if not os.path.exists(dev_archive):
        dev_archive = wget.download(dev_url, str(root))
    if not os.path.exists(rttm_archive):
        rttm_archive = wget.download(rttm_annotations_url, str(root))

    extract_file(test_archive, root / 'test/')
    extract_file(dev_archive, root / 'dev/')
    extract_file(rttm_archive, root)

    _generate_manifest(
        data_root=root,
        audio_path=os.path.abspath(root / 'test/voxconverse_test_wav/'),
        rttm_path=os.path.abspath(root / 'voxconverse-master/test/'),
        manifest_output_path=root / 'test_manifest.json',
    )
    _generate_manifest(
        data_root=root,
        audio_path=os.path.abspath(root / 'dev/audio/'),
        rttm_path=os.path.abspath(root / 'voxconverse-master/dev/'),
        manifest_output_path=root / 'dev_manifest.json',
    )


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
"source": [
"In this tutorial, we shall first train these embeddings on speaker-related datasets, and then get speaker embeddings from a pretrained network for a new dataset. Since Google Colab has very slow read-write speeds, I'll be demonstrating this tutorial using [an4](http://www.speech.cs.cmu.edu/databases/an4/). \n",
"\n",
"Instead, if you'd like to try on a bigger dataset like [hi-mia](https://arxiv.org/abs/1912.01231) use the [get_hi-mia-data.py](https://github.com/NVIDIA/NeMo/blob/stable/scripts/dataset_processing/get_hi-mia_data.py) script to download the necessary files, extract them, and resample to 16Khz if any of these samples are not at 16Khz. "
"Instead, if you'd like to try on a bigger dataset like [hi-mia](https://arxiv.org/abs/1912.01231) use the [get_hi-mia-data.py](https://github.com/NVIDIA/NeMo/tree/main/scripts/dataset_processing/speaker_tasks/get_hi-mia_data.py) script to download the necessary files, extract them, and resample to 16Khz if any of these samples are not at 16Khz. "
]
},
{
Expand Down