From 9c469fc5cdac6f0db5d1de1379412167249d2bef Mon Sep 17 00:00:00 2001 From: Nikolay Karpov Date: Tue, 12 Dec 2023 02:02:06 -0800 Subject: [PATCH] use convert_hf_dataset_to_nemo Signed-off-by: Nikolay Karpov --- .../convert_hf_dataset_to_nemo.py | 2 +- tutorials/asr/Multilang_ASR.ipynb | 37 ++++++++++++++----- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/scripts/speech_recognition/convert_hf_dataset_to_nemo.py b/scripts/speech_recognition/convert_hf_dataset_to_nemo.py index 852c0bc161c2..2cb7ae56df60 100644 --- a/scripts/speech_recognition/convert_hf_dataset_to_nemo.py +++ b/scripts/speech_recognition/convert_hf_dataset_to_nemo.py @@ -361,7 +361,7 @@ def main(cfg: HFDatasetConversionConfig): split=cfg.split, cache_dir=None, streaming=cfg.streaming, - use_auth_token=cfg.use_auth_token, + token=cfg.use_auth_token, ) except Exception as e: diff --git a/tutorials/asr/Multilang_ASR.ipynb b/tutorials/asr/Multilang_ASR.ipynb index 7a11cb7dc6a6..b9aa36acf5ce 100644 --- a/tutorials/asr/Multilang_ASR.ipynb +++ b/tutorials/asr/Multilang_ASR.ipynb @@ -240,7 +240,21 @@ "id": "ATf7lapcbAxB" }, "source": [ - "Now, let's download the Mozilla CommonVoice Spanish dataset. We will ignore the larger train file and get just the test and dev parts for the purposes of this tutorial. For good results, you will need to get the train files and likely other datasets too, bringing the total to over 1k hours. " + "Now, let's download the Mozilla CommonVoice Spanish dataset. We will ignore the larger train file and get just the test part for the purposes of this tutorial. For good results, you will need to get the train files and likely other datasets too, bringing the total to over 1k hours. \n", + "\n", + "Website steps:\n", + "- Visit https://huggingface.co/settings/profile\n", + "- Visit \"Access Tokens\" on list of items.\n", + "- Create new token - provide a name for the token and \"read\" access is sufficient.\n", + " - PRESERVE THAT TOKEN API KEY. 
You can copy that key for the next step.\n", + "- Visit the [HuggingFace Dataset page for Mozilla Common Voice 3.0](https://huggingface.co/datasets/mozilla-foundation/common_voice_3_0)\n", + " - There should be a section that asks you for your approval.\n", + " - Make sure you are logged in and then read that agreement.\n", + " - If and only if you agree to the text, then accept the terms.\n", + "\n", + "Code steps:\n", + "- Now below, run `login()` \n", + "- Paste your preserved HF TOKEN API KEY into the text box." ] }, { @@ -261,7 +275,8 @@ }, "outputs": [], "source": [ - "!mkdir -p datasets/mcv3" + "from huggingface_hub import login\n", + "login()" ] }, { @@ -270,7 +285,7 @@ "id": "YpZNMYfKde9n" }, "source": [ - "We will use the `get_commonvoice_data.py` script located in the nemo/scripts/dataset_processing dir if you cloned NeMo repo" + "We will use the `convert_hf_dataset_to_nemo.py` script located in the nemo/scripts/speech_recognition dir if you cloned the NeMo repo" ] }, { @@ -295,8 +310,8 @@ }, "outputs": [], "source": [ - "if not os.path.exists(\"get_commonvoice_data.py\"):\n", - " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/dataset_processing/get_commonvoice_data.py" + "if not os.path.exists(\"convert_hf_dataset_to_nemo.py\"):\n", + " !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/speech_recognition/convert_hf_dataset_to_nemo.py" ] }, { @@ -321,7 +336,10 @@ }, "outputs": [], "source": [ - "!python get_commonvoice_data.py --data_root \"datasets/mcv3\" --language es --cleanup --version cv-corpus-3" + "! 
python convert_hf_dataset_to_nemo.py \\\n", + " path=\"mozilla-foundation/common_voice_3_0\" \\\n", + " output_dir=\"datasets\" name=\"es\" split='test' \\\n", + " use_auth_token=True" ] }, { @@ -351,9 +369,8 @@ }, "outputs": [], "source": [ - "!head -1000 commonvoice_dev_manifest.json > commonvoice_dev_manifest_1000.json\n", - "!cat commonvoice_test_manifest.json >> commonvoice_train_manifest.json\n", - "!tail -1617 commonvoice_dev_manifest.json >> commonvoice_train_manifest.json" + "!head -1000 datasets/mozilla-foundation/common_voice_3_0/es/test/test_mozilla-foundation_common_voice_3_0_manifest.json > commonvoice_dev_manifest_1000.json\n", + "!tail -1729 datasets/mozilla-foundation/common_voice_3_0/es/test/test_mozilla-foundation_common_voice_3_0_manifest.json > commonvoice_train_manifest.json" ] }, { @@ -616,7 +633,7 @@ }, "outputs": [], "source": [ - "es_files = ['datasets/mcv3/dev/wav/common_voice_es_18309780.wav', 'datasets/mcv3/dev/wav/common_voice_es_18311421.wav']" + "es_files = ['datasets/mozilla-foundation/common_voice_3_0/es/test/clips/common_voice_es_18481930.wav', 'datasets/mozilla-foundation/common_voice_3_0/es/test/clips/common_voice_es_18481932.wav']" ] }, {