diff --git a/tutorials/tts/FastPitch_GermanTTS_Training.ipynb b/tutorials/tts/FastPitch_GermanTTS_Training.ipynb index 4845981c7d10..3ada56f9dfb2 100644 --- a/tutorials/tts/FastPitch_GermanTTS_Training.ipynb +++ b/tutorials/tts/FastPitch_GermanTTS_Training.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "9e23311d", + "id": "2d65b905", "metadata": {}, "source": [ "# Modifying FastPitch to Train on a Non-English (German) Dataset\n", @@ -16,12 +16,12 @@ }, { "cell_type": "markdown", - "id": "2e57b884", + "id": "42273266", "metadata": {}, "source": [ "# License\n", "\n", - "> Copyright 2022 NVIDIA. All Rights Reserved.\n", + "> Copyright 2023 NVIDIA. All Rights Reserved.\n", "> \n", "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", "> you may not use this file except in compliance with the License.\n", @@ -39,31 +39,47 @@ { "cell_type": "code", "execution_count": null, - "id": "392161ff", + "id": "f4c77c34", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", - "You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", "Instructions for setting up Colab are as follows:\n", "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies# .\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL).\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator).\n", + "4. Run this cell to set up dependencies.\n", + "5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect.\n", "\"\"\"\n", - "BRANCH = 'r1.16.0'\n", - "# # If you're using Colab and not running locally, uncomment and run this cell.\n", + "\n", + "# If you're using Google Colab and not running locally, run this cell.\n", + "\n", + "## Install dependencies\n", "# !apt-get install sox libsndfile1 ffmpeg\n", - "# !pip install wget text-unidecode pynini==2.1.4 scipy==1.7.3\n", - "# !python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n", - "# !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/nemo_text_processing/install_pynini.sh\n", - "# !bash install_pynini.sh" + "# !pip install wget text-unidecode matplotlib>=3.3.2\n", + "\n", + "## Install NeMo\n", + "BRANCH = 'r1.16.0'\n", + "# !python -m pip install \"git+https://github.com/NVIDIA/NeMo.git@${BRANCH}#egg=nemo_toolkit[all]\"\n", + "\n", + "## Install pynini\n", + "# !wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/nemo_text_processing/install_pynini.sh\n", + "# !bash install_pynini.sh\n", + "\n", + "\"\"\"\n", + "Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. 
matplotlib)!\n", + "Alternatively, you can uncomment the exit() below to crash and restart the kernel, in case\n", + "you want to use the \"Run All Cells\" (or similar) option.\n", + "\"\"\"\n", + "# exit()" ] }, { "cell_type": "code", "execution_count": null, - "id": "d9a5e132", + "id": "f018747c", "metadata": {}, "outputs": [], "source": [ @@ -80,18 +96,20 @@ { "cell_type": "code", "execution_count": null, - "id": "c588ff4f", + "id": "797bb19a", "metadata": {}, "outputs": [], "source": [ "# let's download the files we need to run this tutorial\n", - "\n", + "!rm -rf NeMoGermanTTS\n", "!mkdir NeMoGermanTTS\n", - "!cd NeMoGermanTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/dataset_processing/tts/openslr_95/get_data.py\n", + "!cd NeMoGermanTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/dataset_processing/tts/thorsten_neutral/get_data.py\n", + "!cd NeMoGermanTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/dataset_processing/tts/thorsten_neutral/ds_conf/ds_for_fastpitch_align.yaml\n", "!cd NeMoGermanTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/fastpitch.py\n", "!cd NeMoGermanTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/hifigan_finetune.py\n", "!cd NeMoGermanTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/dataset_processing/tts/extract_sup_data.py\n", - "!cd NeMoGermanTTS && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/de/fastpitch_align_22050.yaml\n", + "!cd NeMoGermanTTS && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/scripts/dataset_processing/tts/generate_mels.py\n", + "!cd NeMoGermanTTS && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/de/fastpitch_align_22050_grapheme.yaml\n", "!cd NeMoGermanTTS && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/tts/conf/hifigan/hifigan.yaml\n", "!cd NeMoGermanTTS && wget https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/nemo_text_processing/text_normalization/de/data/whitelist.tsv\n", "!cd NeMoGermanTTS && mkdir -p model/train_ds && cd model/train_ds && wget https://raw.githubusercontent.com/nvidia/NeMo/$BRANCH/examples/tts/conf/hifigan/model/train_ds/train_ds_finetune.yaml\n", @@ -101,7 +119,7 @@ }, { "cell_type": "markdown", - "id": "c3b37631", + "id": "d9e63f12", "metadata": {}, "source": [ "# Introduction" ] }, { "cell_type": "markdown", - "id": "ba60fc45", + "id": "74b10183", "metadata": {}, "source": [ "### FastPitch\n", @@ -124,7 +142,7 @@ }, { "cell_type": "markdown", - "id": "747a9f24", + "id": "f8b533b7", "metadata": {}, "source": [ "# Dataset Preparation" ] }, { "cell_type": "markdown", - "id": "e1cf7a8d", + "id": "89d94d7c", "metadata": {}, "source": [ "We will show an example of preprocessing and training using OpenSLR's German Neutral TTS dataset ([link](https://www.openslr.org/95)). It is a free single-speaker German dataset (> 23 hours) by Thorsten Müller (voice) and Dominik Kreutz (audio optimization) for TTS training. \n", "\n", "In this section, we will cover:\n", "1. Downloading the dataset\n", - "2. Creating manifests\n", - "3. Normalizing text\n", - "4. Phonemization\n", - "5. Creating dataset config\n", - "6. Creating supplementary data" + "2. Extracting supplementary data" ] }, { "cell_type": "markdown", - "id": "40e76a38", + "id": "0863d60c", "metadata": {}, "source": [ - "## 1. 
Downloading the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36b8a1d3", - "metadata": {}, - "outputs": [], - "source": [ - "!mkdir DataGermanTTS && \\\n", - " cd DataGermanTTS && \\\n", - " wget https://us.openslr.org/resources/95/thorsten-de_v02.tgz && \\\n", - " tar -zxvf thorsten-de_v02.tgz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6db032e5", - "metadata": {}, - "outputs": [], - "source": [ - "# DataGermanTTS directory looks like\n", - "!ls DataGermanTTS -R" - ] - }, - { - "cell_type": "markdown", - "id": "0e94204b", - "metadata": {}, - "source": [ - "\n", - "```bash\n", - "$ ls DataGermanTTS -R\n", - "DataGermanTTS:\n", - "thorsten-de thorsten-de_v02.tgz\n", - "\n", - "DataGermanTTS/thorsten-de:\n", - "metadata.csv metadata_shuf.csv metadata_train.csv metadata_val.csv wavs\n", - "\n", - "DataGermanTTS/thorsten-de/wavs:\n", - "00025a6fbea659dae6ece011e749aa34.wav 80689a91d5c8e32847ccbba2322e2122.wav\n", - "000314280388fb390b3e70b69ee53a23.wav 8068cbcbe28085c15d2e8a8f7291d009.wav\n", - "000624f768d7e282534a850980619fb2.wav 8071b84557c9a780d23414e241393f00.wav\n", - "000c5b9d181c934e8a343fb561c928bd.wav 8073badcfce74546b2bab83f76dbf043.wav\n", - "001364406f288f03136403c611fff1dc.wav 80743e8e580d128673e181cc15b47cc9.wav\n", - "...\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "c4f33db9", - "metadata": {}, - "source": [ - "## 2. Creating manifests \n", + "## 1. Download Thorsten's Neutral Dataset and Create Data Manifests \n", "\n", - "We've created `scripts/dataset_processing/tts/openslr_95/get_data.py` script that reads the `DataGermanTTS/thorsten-de/metadata.csv` provided with the dataset and generates the following fields per each datapoint:\n", - "1. `audio_filepath`: location of the wav file\n", - "2. `duration`: duration of the wav file\n", - "3. `text`: original text supplied by OpenSLR\n", + "We've created a script, `scripts/dataset_processing/tts/thorsten_neutral/get_data.py`, to download Thorsten's Datasets and generate train/val/test splits as JSON manifests with the following fields for each datapoint:\n", + "1. `audio_filepath`: location of the wav file;\n", + "2. `duration`: duration of the wav file;\n", + "3. `text`: original text;\n", + "4. `normalized_text`: normalized text through our text normalization pipline.\n", " \n", - "After that, the script randomly splits the datapoints into 3 buckets, `train_manifest.json`, `val_manifest.json` and `test_manifest.json`.\n", + "This script supports processing either of Thorsten's Neutral Datasets 21.02 or 22.10. In this tutorial, we only focus on the latest 22.10 version dataset. Please refer [thorsten-muller-s-german-neutral-tts-datasets](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tts/datasets.html#thorsten-muller-s-german-neutral-tts-datasets) for more details about Thorsten's datasets. \n", "\n", - "Note: This step will take sometime to run for the entire dataset. If you are only interested in testing the scripts, please feel free to shorten the `DataGermanTTS/thorsten-de/metadata.csv` file to include only, say, top 100 records." 
- ] }, { "cell_type": "code", "execution_count": null, "id": "eb063cf4", "metadata": {}, "outputs": [], "source": [ "!(cd NeMoGermanTTS && \\\n", " python get_data.py \\\n", " --data-root ../DataGermanTTS/ \\\n", " --val-size 0.1 \\\n", " --test-size 0.2)" ] }, { "cell_type": "markdown", "id": "3811cc56", "metadata": {}, "source": [ "In the example above, 10% datapoints go to validation set, 20% go to test set and the remaining 70% go to training set." + "You can run the command below to obtain the final manifests, `train_manifest_text_normed.json`, `val_manifest_text_normed.json` and `test_manifest_text_normed.json`. **Note** that this script will take some time (~2 hours) to download and normalize the entire dataset." ] }, { "cell_type": "code", "execution_count": null, "id": "3384c8fe", "metadata": {}, "outputs": [], "source": [ "# Version 22.10\n", "!cd NeMoGermanTTS && python get_data.py \\\n", " --data-root ../data_thorsten_2210 \\\n", " --manifests-root ./ \\\n", " --data-version \"22_10\" \\\n", " --val-size 100 \\\n", " --test-size 100 \\\n", " --seed-for-ds-split 100 \\\n", " --num-workers -1 \\\n", " --normalize-text" ] }, { "cell_type": "code", "execution_count": null, "id": "8a7cf49e", "metadata": {}, "outputs": [], "source": [ "# data_thorsten_2210 directory looks like\n", "!ls ./data_thorsten_2210/ThorstenVoice-Dataset-22_10/ThorstenVoice-Dataset_2022.10/" ] }, { "cell_type": "code", "execution_count": null, "id": "bc6f120f", "metadata": {}, "outputs": [], "source": [ "# manifests\n", "!ls ./NeMoGermanTTS/*_text_normed.json" ] }, { "cell_type": "code", "execution_count": null, "id": "5c7b9430", "metadata": {}, "outputs": [], "source": [ "# DataGermanTTS directory looks like\n", "!ls DataGermanTTS -R" ] }, { "cell_type": "markdown", "id": "4d2dd715", "metadata": {}, "source": [ "```bash\n", "$ ls DataGermanTTS -R\n", "DataGermanTTS:\n", "thorsten-de\n", "thorsten-de_v02.tgz\n", "\n", "DataGermanTTS/thorsten-de:\n", "metadata.csv\n", "metadata_shuf.csv\n", "metadata_train.csv\n", "metadata_val.csv\n", "test_manifest.json\n", "train_manifest.json\n", "val_manifest.json\n", "wavs\n", "\n", "DataGermanTTS/thorsten-de/wavs:\n", "00025a6fbea659dae6ece011e749aa34.wav\n", "000314280388fb390b3e70b69ee53a23.wav\n", "000624f768d7e282534a850980619fb2.wav\n", "...\n", "```" ] }, { "cell_type": "markdown", "id": "2f6ea189", "metadata": {}, "source": [ "## 3. Normalizing text\n", "\n", "The script above, i.e. `scripts/dataset_processing/tts/openslr_95/get_data.py`, also generates another field per each datapoint:\n", "- `normalized_text`: normalized text via NeMo's text normalizer:\n", " ```python\n", " nemo_text_processing.text_normalization.normalize.Normalizer(lang=\"de\", input_case=\"cased\", overwrite_cache=True, cache_dir=str(file_path / \"cache_dir\"))\n", " ```\n", " \n", "German language text normalizer (defined here: `nemo_text_processing/text_normalization/de`) was created using the tutorial shared under NeMo's `Grammar customization` documentation [here](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/text_normalization/wfst/wfst_text_normalization.html#grammar-customization). Here are some example records:\n", "```json\n", "{\"audio_filepath\": \"DataGermanTTS/thorsten-de/wavs/f1becc89cb4079a123ead68c9c8bb8ae.wav\", \"duration\": 7.250023, \"text\": \"Öffne den Webbrowser und rufe www.archlinux.org auf.\", \"normalized_text\": \"Öffne den Webbrowser und rufe w w w punkt a r c h l i n u x punkt o r g auf.\"}\n", "```\n", "Notice that the URL has been spelled out. \n", "\n", "In other cases, the normalized text may look the same as text, example:\n", "```json\n", "{\"audio_filepath\": \"DataGermanTTS/thorsten-de/wavs/e50eb02c25353f85549900d2fc1e0e32.wav\", \"duration\": 2.409977, \"text\": \"Geht die Schandtat auf sein Konto?\", \"normalized_text\": \"Geht die Schandtat auf sein Konto?\"}\n", "```" ] }, { "cell_type": "markdown", "id": "6c7f253e", "metadata": {}, "source": [ "## 4. Phonemization\n", "\n", "The pronunciation of a word can be represented as a string of phones, which are minimal speech sound units, each represented with symbols adapted from the Roman alphabet. 
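Once `get_data.py` has finished, it can be worth sanity-checking the generated manifests before moving on. A small sketch, assuming the three `*_text_normed.json` files listed above in `NeMoGermanTTS/`, each holding one JSON record per line with the four fields described earlier:

```python
# Sketch: verify record counts and total audio duration of the generated manifests.
import json

for split in ("train", "val", "test"):
    path = f"NeMoGermanTTS/{split}_manifest_text_normed.json"
    with open(path, "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f]
    assert all("normalized_text" in r for r in records)  # field added by --normalize-text
    hours = sum(r["duration"] for r in records) / 3600.0
    print(f"{split}: {len(records)} records, {hours:.2f} hours")
```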
The IPA is designed to represent those qualities of speech that are part of lexical (and to a limited extent prosodic) sounds in spoken form: phones, phonemes, intonation and the separation of words and syllables. Training models with phonemes as well as text will help the model generate more accurate speech sounds." + "# Version 22.10\n", + "!cd NeMoGermanTTS && python get_data.py \\\n", + " --data-root ../data_thorsten_2210 \\\n", + " --manifests-root ./ \\\n", + " --data-version \"22_10\" \\\n", + " --val-size 100 \\\n", + " --test-size 100 \\\n", + " --seed-for-ds-split 100 \\\n", + " --num-workers -1 \\\n", + " --normalize-text" ] }, { "cell_type": "code", "execution_count": null, - "id": "b6c8dad0", + "id": "8a7cf49e", "metadata": {}, "outputs": [], "source": [ - "print(\"text: Geht die Schandtat auf sein Konto?\")\n", - "print(\"phoneme: \\u0261e\\u02d0t di\\u02d0 \\u0283ant\\u0251\\u02d0t a\\u028af za\\u026an k\\u0254nto\\u02d0\")" - ] - }, - { - "cell_type": "markdown", - "id": "f5a88926", - "metadata": {}, - "source": [ - "The original JSON dataset split generated from `get_data.py` only contains text/grapheme inputs. We recommend adding phonemes as well to obtain better quality of synthesized audios. So you would expect the dataset double sized. In order to add phonemes, we need external tools to convert German text into phonemes. There are several open-sourced external tools handling such phoneme transliteration. You may choose any per your interests. But in this tutorial,we demonstrate the process using [bootphon/phonemizer](https://github.com/bootphon/phonemizer) that applies espeak backend. Before running, please install phonemizer on your local machine via `pip install` and `apt-get install` as shown below," + "# data_thorsten_2210 directory looks like\n", + "!ls ./data_thorsten_2210/ThorstenVoice-Dataset-22_10/ThorstenVoice-Dataset_2022.10/" ] }, { "cell_type": "code", "execution_count": null, - "id": "2ffe9bb6", + "id": "bc6f120f", "metadata": {}, "outputs": [], "source": [ - "!pip install phonemizer && apt-get install espeak-ng" + "# manifests\n", + "!ls ./NeMoGermanTTS/*_text_normed.json" ] }, { "cell_type": "markdown", - "id": "f6a79794", - "metadata": {}, - "source": [ - "Alternatively, you can use phonemizer via docker container:\n", - "```bash\n", - "git clone https://github.com/bootphon/phonemizer\n", - "cd phonemizer\n", - "docker build -t phonemizer .\n", - "docker run --rm -d -it -p 8888:8888 -v DataGermanTTS:DataGermanTTS --ipc=host phonemizer /bin/bash\n", - "docker exec -it /bin/bash\n", - "```\n", - "\n", - "Other install methods for phonemizer are listed [here](https://bootphon.github.io/phonemizer/install.html). The following code snippet shows a general guidance about how to transliterate graphemes into phonemes using phonemizer tool. You could manually run it to append phoneme manifest." 
- ] - }, - { - "cell_type": "markdown", - "source": [ - "```python\n", - "import json\n", - "from pathlib import Path\n", - "from phonemizer.backend import EspeakBackend\n", - "from tqdm import tqdm\n", - "\n", - "def phonemization(manifest, language):\n", - " # you can also consider with_stress=True and add stress symbols into charset of tokenizer for experimental purpose.\n", - " backend = EspeakBackend(language=language, preserve_punctuation=True)\n", - " print(f\"Phonemizing: {manifest}\")\n", - " entries = []\n", - " with open(manifest, 'r') as fjson:\n", - " for line in tqdm(fjson):\n", - " # grapheme\n", - " grapheme_dct = json.loads(line.strip())\n", - " grapheme_dct.update({\"is_phoneme\": 0})\n", - " # phoneme\n", - " phoneme_dct = grapheme_dct.copy()\n", - " # you can also add a separator.Separator(phone=\"_\") to distinguish phone or word boundaries for experimental purpose.\n", - " phonemes = backend.phonemize([grapheme_dct[\"normalized_text\"]], strip=True)\n", - " phoneme_dct[\"normalized_text\"] = phonemes[0]\n", - " phoneme_dct[\"is_phoneme\"] = 1\n", - "\n", - " entries.append(grapheme_dct)\n", - " entries.append(phoneme_dct)\n", - "\n", - " output_manifest_filepath = manifest.parent / f\"{manifest.stem}_phonemes{manifest.suffix}\"\n", - " with open(output_manifest_filepath, \"w\", encoding=\"utf-8\") as fout:\n", - " for entry in entries:\n", - " fout.write(f\"{json.dumps(entry)}\\n\")\n", - " print(f\"Phonemizing is complete: {manifest} --> {output_manifest_filepath}\")\n", - "\n", - "input_manifest_filepaths = [\n", - " \"DataGermanTTS/thorsten-de/train_manifest.json\",\n", - " \"DataGermanTTS/thorsten-de/test_manifest.json\",\n", - " \"DataGermanTTS/thorsten-de/val_manifest.json\"\n", - "]\n", - "\n", - "language = 'de'\n", - "for manifest in input_manifest_filepaths:\n", - " phonemization(Path(manifest), language)\n", - "```" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "markdown", - "id": "e0e4a206", + "id": "1c42e260", "metadata": {}, "source": [ - "To better understand the phonemize method, refer to the docs [here](https://github.com/bootphon/phonemizer/blob/master/phonemizer/backend/base.py#L137).\n", - "\n", - "Run the above script for train, test and val records, resulting in `train_manifest_phonemes.json`, `test_manifest_phonemes.json` and `val_manifest_phonemes.json` respectively.\n", + "## 2. Extracting Supplementary Data\n", "\n", - "We are effectively doubling the size of our dataset. Each original record maps on to two records, one with original `normalized_text` field value and `is_phoneme` set to 0 and another with phonemized text and `is_phoneme` flag set to 1.\n", + "As mentioned in the [FastPitch and MixerTTS training tutorial](FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean, std, min, and max). 
To do this, all we need to do is iterate over our data once, via the `extract_sup_data.py` script.\n", "\n", "**Note**: This is an optional step; if skipped, it will be executed automatically within the first epoch of FastPitch training." ] }, { "cell_type": "markdown", - "id": "18578da2", + "id": "13d1a32b", "metadata": {}, "source": [ - "## 5. Creating dataset config\n", - "\n", - "Most of the configuration remains the same as described in [FastPitch and MixerTTS training tutorial](FastPitch_MixerTTS_Training.ipynb) except:\n", - "1. The `text_tokenizer._target_` is set to `nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.GermanCharsTokenizer` class defined here: `collections/common/tokenizers/text_to_speech/tts_tokenizers.py`, and pass `is_phoneme: true`, which will extend the `de_alphabet` to include IPA symbols, resulting in `abcdefghijklmnopqrstuvwxyzäöüßʊʃŋɜːɛɾəɪçɔøɡœɑÜ„1Q̃ɒʒÄɹÖʌθàó̈ðéɐá`. \n", "\n", - "2. The `text_normalizer.lang` is set to `de`, in order to use the tokenizer defined in `nemo_text_processing/text_normalization/de` (as discussed above).\n", "\n", - "3. 
Update the `whitelist_path` to point to German whitelist: `nemo_text_processing/text_normalization/de/data/whitelist.tsv`\n", - "\n", - "Final config looks like:\n", - "\n", - "```yaml\n", - "name: \"ds_for_fastpitch_align\"\n", - "\n", - "manifest_filepath: \"train_manifest.json\"\n", - "sup_data_path: \"sup_data\"\n", - "sup_data_types: [ \"align_prior_matrix\", \"pitch\" ]\n", - "whitelist_path: \"NeMoGermanTTS/whitelist.tsv\"\n", - "\n", - "dataset:\n", - " _target_: nemo.collections.tts.torch.data.TTSDataset\n", - " manifest_filepath: ${manifest_filepath}\n", - " sample_rate: 22050\n", - " sup_data_path: ${sup_data_path}\n", - " sup_data_types: ${sup_data_types}\n", - " n_fft: 1024\n", - " win_length: 1024\n", - " hop_length: 256\n", - " window: \"hann\"\n", - " n_mels: 80\n", - " lowfreq: 0\n", - " highfreq: 8000\n", - " max_duration: null\n", - " min_duration: 0.1\n", - " ignore_file: null\n", - " trim: false\n", - " pitch_fmin: 65.40639132514966\n", - " pitch_fmax: 2093.004522404789\n", - "\n", - " text_normalizer:\n", - " _target_: nemo_text_processing.text_normalization.normalize.Normalizer\n", - " lang: de\n", - " input_case: cased\n", - " whitelist: ${whitelist_path}\n", - "\n", - " text_normalizer_call_kwargs:\n", - " verbose: false\n", - " punct_pre_process: true\n", - " punct_post_process: true\n", - "\n", - " text_tokenizer:\n", - " _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.GermanCharsTokenizer\n", - " punct: true\n", - " apostrophe: true\n", - " pad_with_space: true\n", - " phonemes: true\n", - "```\n", - "\n", - "Save the above config in `NeMoGermanTTS/ds_for_fastpitch_align.yaml`." + "The configuration remains the same as described in `scripts/dataset_processing/tts/thorsten_neutral/ds_conf/ds_for_fastpitch_align.yaml`, except that `whitelist_path` should point to `whitelist.tsv` in this tutorial." 
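If you would like to inspect the dataset config before launching the extraction, a small sketch (assuming `omegaconf`, which is installed together with NeMo, and the `ds_for_fastpitch_align.yaml` downloaded into `NeMoGermanTTS` earlier) could look like this:

```python
# Sketch: load the dataset config and apply the same whitelist override we pass on the CLI below.
from omegaconf import OmegaConf

cfg = OmegaConf.load("NeMoGermanTTS/ds_for_fastpitch_align.yaml")
cfg.whitelist_path = "whitelist.tsv"          # the tutorial-local whitelist downloaded earlier
print(OmegaConf.to_yaml(cfg, resolve=False))  # raw view of sup_data_types, pitch_fmin/fmax, etc.
```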
] }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [ "tmp = '''\\\n", "name: \"ds_for_fastpitch_align\"\n", "\n", "manifest_filepath: \"train_manifest.json\"\n", "sup_data_path: \"sup_data\"\n", "sup_data_types: [ \"align_prior_matrix\", \"pitch\" ]\n", "whitelist_path: \"NeMoGermanTTS/whitelist.tsv\"\n", "\n", "dataset:\n", " _target_: nemo.collections.tts.torch.data.TTSDataset\n", " manifest_filepath: ${manifest_filepath}\n", " sample_rate: 22050\n", " sup_data_path: ${sup_data_path}\n", " sup_data_types: ${sup_data_types}\n", " n_fft: 1024\n", " win_length: 1024\n", " hop_length: 256\n", " window: \"hann\"\n", " n_mels: 80\n", " lowfreq: 0\n", " highfreq: 8000\n", " max_duration: null\n", " min_duration: 0.1\n", " ignore_file: null\n", " trim: false\n", " pitch_fmin: 65.40639132514966\n", " pitch_fmax: 2093.004522404789\n", "\n", " text_normalizer:\n", " _target_: nemo_text_processing.text_normalization.normalize.Normalizer\n", " lang: de\n", " input_case: cased\n", " whitelist: ${whitelist_path}\n", "\n", " text_normalizer_call_kwargs:\n", " verbose: false\n", " punct_pre_process: true\n", " punct_post_process: true\n", "\n", " text_tokenizer:\n", " _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.GermanCharsTokenizer\n", " punct: true\n", " apostrophe: true\n", " pad_with_space: true\n", " phonemes: true\n", "'''\n", "with open('NeMoGermanTTS/ds_for_fastpitch_align.yaml', 'w') as f:\n", " f.write(tmp)" ], "metadata": { "collapsed": false } }, { "cell_type": "markdown", - "id": "b515c5b4", + "id": "9588c280", "metadata": {}, "source": [ - "## 6. Creating Supplementary Data\n", - "\n", - "As mentioned in the [FastPitch and MixerTTS training tutorial](FastPitch_MixerTTS_Training.ipynb) - To accelerate and stabilize our training, we also need to extract pitch for every audio, estimate pitch statistics (mean and std) and pre-calculate alignment prior matrices for alignment framework. To do this, all we need to do is iterate over our data one time, via `extract_sup_data.py` script.\n", - "\n", - "Note: This is an optional step, if skipped, it will be automatically executed within the first epoch of training FastPitch." + "Run the command line below to obtain the pitch normalization parameters estimated on the entire dataset. This will take around 1.5 hours." ] }, { "cell_type": "code", "execution_count": null, - "id": "114dabfc", + "id": "808b5048", "metadata": {}, "outputs": [], "source": [ - "!python NeMoGermanTTS/extract_sup_data.py \\\n", + "!cd NeMoGermanTTS && python extract_sup_data.py \\\n", " --config-path . \\\n", " --config-name ds_for_fastpitch_align.yaml \\\n", - " manifest_filepath=DataGermanTTS/thorsten-de/train_manifest_phonemes.json \\\n", - " sup_data_path=DataGermanTTS/thorsten-de/phonemes/" + " manifest_filepath=train_manifest_text_normed.json \\\n", + " sup_data_path=sup_data \\\n", + " whitelist_path=whitelist.tsv \\\n", + " ++dataloader_params.num_workers=4" ] }, { "cell_type": "markdown", - "id": "adfba0f9", + "id": "bb2d7f2b", "metadata": {}, "source": [ - "The above example gives the following result:\n", - "1. Creates two folders under `sup_data_path` - `pitch` and `align_prior_matrix`\n", - "2. Prints out some values for pitch mean and standard deviation: `PITCH_MEAN, PITCH_STD = 132.524658203125, 37.389366149902344`. Use these values while training FastPitch." 
+ "After running the above command line, you will observe a new folder `NeMoGermanTTS/sup_data/pitch` and printouts of pitch statistics below. Specify these values to the FastPitch training configurations. We will be there in the following section.\n", + "```bash\n", + "PITCH_MEAN=126.70349884033203, PITCH_STD=31.456707000732422\n", + "PITCH_MIN=65.4063949584961, PITCH_MAX=1998.48779296875\n", + "```" ] }, { "cell_type": "markdown", - "id": "3278f1ee", + "id": "6f4c41e9", "metadata": {}, "source": [ "# Training" @@ -604,32 +285,15 @@ }, { "cell_type": "markdown", - "id": "1ef91842", - "metadata": {}, - "source": [ - "Before we train our model, let's define model config. Most of the model config stays the same as defined here: `examples/tts/conf/fastpitch_align_44100.yaml`, except:\n", - "1. Make the same changes that we made in dataset config for `text_tokenizer`, `text_normalizer` and `whitelist_path`.\n", - "\n", - "2. The `pitch_mean` and `pitch_std` are updated to the values reported by the `extract_sup_data.py` script.\n", - "\n", - "3. The `sample_rate` is updated to 22050 KHz per our dataset. And accordingly halve the `n_window_size`, `n_window_stride` and `n_fft` parameters as well. \n", - "\n", - "We have already downloaded the config after making these changes here: `NeMoGermanTTS/fastpitch_align_22050.yaml`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7f2373e", + "id": "14c2ff60", "metadata": {}, - "outputs": [], "source": [ - "!cat NeMoGermanTTS/fastpitch_align_22050.yaml" + "Before we train our model, let's define model config. Most of the model config stays the same as defined here: `examples/tts/conf/de/fastpitch_align_22050_grapheme.yaml`, except that `whitelist_path` should point to `whitelist.tsv` in this tutorial, and The `pitch_mean` and `pitch_std` should be updated with the values estimated by the above `extract_sup_data.py` script. " ] }, { "cell_type": "markdown", - "id": "1de1cf64", + "id": "ac938960", "metadata": {}, "source": [ "If you are using Weights and Biases, you may need to login first. More details [here](https://docs.wandb.ai/ref/cli/wandb-login)." @@ -638,8 +302,12 @@ { "cell_type": "code", "execution_count": null, - "id": "c60ed72a", - "metadata": {}, + "id": "0ab8b471", + "metadata": { + "tags": [ + "parameters" + ] + }, "outputs": [], "source": [ "!wandb login #paste_wandb_apikey_here" @@ -647,31 +315,31 @@ }, { "cell_type": "markdown", - "id": "51ec332e", + "id": "d4364261", "metadata": {}, "source": [ - "Now we are ready for training our model! Let's try to train FastPitch. Paste the PITCH_MEAN and PITCH_STD from previous steps here." + "Now we are ready for training our model! Let's try to train FastPitch. Copy and Paste the `PITCH_MEAN` and `PITCH_STD` from previous steps to overide `pitch_mean` and `pitch_std` configs below." ] }, { "cell_type": "code", "execution_count": null, - "id": "4ad43d30", + "id": "0e8e0cd7", "metadata": {}, "outputs": [], "source": [ - "!(cd NeMoGermanTTS && CUDA_VISIBLE_DEVICES=0 python fastpitch.py --config-path . --config-name fastpitch_align_22050 \\\n", + "!(cd NeMoGermanTTS && CUDA_VISIBLE_DEVICES=0 python fastpitch.py --config-path . 
--config-name fastpitch_align_22050_grapheme \\\n", " model.train_ds.dataloader_params.batch_size=32 \\\n", " model.validation_ds.dataloader_params.batch_size=32 \\\n", - " train_dataset=../DataGermanTTS/thorsten-de/train_manifest_phonemes.json \\\n", - " validation_datasets=../DataGermanTTS/thorsten-de/val_manifest_phonemes.json \\\n", - " sup_data_path=../DataGermanTTS/thorsten-de/phonemes/ \\\n", - " whitelist_path=./whitelist.tsv \\\n", + " train_dataset=train_manifest_text_normed.json \\\n", + " validation_datasets=val_manifest_text_normed.json \\\n", + " sup_data_path=sup_data \\\n", + " whitelist_path=whitelist.tsv \\\n", " exp_manager.exp_dir=resultGermanTTS \\\n", " trainer.max_epochs=1 \\\n", " trainer.check_val_every_n_epoch=1 \\\n", - " pitch_mean=#paste_pitch_mean_here \\\n", - " pitch_std=#paste_pitch_std_here \\\n", + " pitch_mean=126.70349884033203 \\\n", + " pitch_std=31.456707000732422 \\\n", " +exp_manager.create_wandb_logger=true \\\n", " +exp_manager.wandb_logger_kwargs.name=\"tutorial\" \\\n", " +exp_manager.wandb_logger_kwargs.project=\"GermanTTS\")" @@ -679,19 +347,19 @@ }, { "cell_type": "markdown", - "id": "b8082cfc", + "id": "fe0bfbb9", "metadata": {}, "source": [ "Note:\n", "1. We use `CUDA_VISIBLE_DEVICES=0` to limit training to single GPU.\n", "2. For debugging you may also add the following flags: `HYDRA_FULL_ERROR=1`, `CUDA_LAUNCH_BLOCKING=1`\n", "\n", - "Note: We've limited the above run to 1 epoch only, so we can validate the implementation within the scope of this tutorial. We recommend around 1000 epochs when training FastPitch from scratch." + "Note: We've limited the above run to 1 epoch only, so we can validate the implementation within the scope of this tutorial. We recommend around 5000 epochs when training FastPitch from scratch." 
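Since a from-scratch run of several thousand epochs will usually span multiple sessions, one hypothetical convenience is a resumable variant of the command above. The sketch below assumes NeMo's standard `exp_manager` resume options; the leading `+` assumes the keys are absent from `fastpitch_align_22050_grapheme.yaml`, so drop it if they are already present:

```python
# Sketch: a resumable long training run. exp_manager picks up the latest
# checkpoint under exp_dir instead of starting over after a restart.
!(cd NeMoGermanTTS && CUDA_VISIBLE_DEVICES=0 python fastpitch.py --config-path . --config-name fastpitch_align_22050_grapheme \
    train_dataset=train_manifest_text_normed.json \
    validation_datasets=val_manifest_text_normed.json \
    sup_data_path=sup_data \
    whitelist_path=whitelist.tsv \
    exp_manager.exp_dir=resultGermanTTS \
    trainer.max_epochs=5000 \
    pitch_mean=126.70349884033203 \
    pitch_std=31.456707000732422 \
    +exp_manager.resume_if_exists=true \
    +exp_manager.resume_ignore_no_checkpoint=true)
```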
] }, { "cell_type": "markdown", - "id": "7a36f955", + "id": "632d51b0", "metadata": {}, "source": [ "## Evaluating FastPitch + pretrained HiFi-GAN\n", @@ -702,7 +370,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d70d5f7d", + "id": "053a00a2", "metadata": {}, "outputs": [], "source": [ @@ -715,22 +383,20 @@ { "cell_type": "code", "execution_count": null, - "id": "4e9ee07a", + "id": "d82bc36b", "metadata": {}, "outputs": [], "source": [ - "hfg_ngc = \"tts_hifigan\" # NGC pretrained model name: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_hifigan \n", - "fastpitch_model_path = \"\" # from the results directory\n", - "test = \"Diese Musiksammlung soll die Vielfalt des Lebens widerspiegeln.\" # text input to the model\n", - "test_id = \"877d9f668a877713b48735f282af62ca\" # identifier for the audio corresponding to the test text\n", - "data_path = \"DataGermanTTS/thorsten-de/wavs/\" # path to dataset folder with wav files from original dataset\n", + "test = \"das werden einmal \\u00fcber eine million kleiner buchenpflanzen werden.\" # text input to the model\n", + "test_id = \"2b2b496ccc9b57130f559c4fd827825f\" # identifier for the audio corresponding to the test text\n", + "data_path = \"data_thorsten_2210/ThorstenVoice-Dataset-22_10/ThorstenVoice-Dataset_2022.10/wavs/\" # path to dataset folder with wav files from original dataset\n", "seed = 1234" ] }, { "cell_type": "code", "execution_count": null, - "id": "32a234f4", + "id": "87ff970d", "metadata": {}, "outputs": [], "source": [ @@ -754,11 +420,18 @@ { "cell_type": "code", "execution_count": null, - "id": "9a3f5eaa", + "id": "70733af0", "metadata": {}, "outputs": [], "source": [ - "# load models\n", + "# load fastpitch and hifigan models\n", + "import glob, os\n", + "fastpitch_model_path = sorted(\n", + " glob.glob(\"NeMoGermanTTS/resultGermanTTS/FastPitch/*/checkpoints/FastPitch.nemo\"), \n", + " key=os.path.getmtime\n", + ")[-1] # path_to_fastpitch_nemo_or_ckpt\n", + "hfg_ngc = \"tts_en_lj_hifigan_ft_mixerttsx\" # NGC pretrained model name: https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_en_lj_hifigan \n", + "\n", "vocoder_model = HifiGanModel.from_pretrained(hfg_ngc, strict=False).eval().cuda()\n", "if \".nemo\" in fastpitch_model_path:\n", " spec_gen_model = FastPitchModel.restore_from(fastpitch_model_path).eval().cuda()\n", @@ -769,7 +442,7 @@ { "cell_type": "code", "execution_count": null, - "id": "de04b514", + "id": "27aa1827", "metadata": {}, "outputs": [], "source": [ @@ -791,25 +464,25 @@ }, { "cell_type": "markdown", - "id": "e57e9baf", + "id": "15006392", "metadata": {}, "source": [ - "We see that audio quality is not as good as we expect, even after training FastPitch for 1000 epochs. One of the ways mentioned in the [FastPitch_Finetuning.ipynb](FastPitch_Finetuning.ipynb) tutorial is to finetune HiFi-GAN. Lets try that out next!" + "You will hear that the above synthesized audio quality is not as good as we expect. It would improve after continuing to train for 5000 epochs but, even then, the quality is still not acceptable. A straightforward solution is to finetune the HiFi-GAN model following the tutorial [FastPitch_Finetuning.ipynb](FastPitch_Finetuning.ipynb). Let's try that out next!" ] }, { "cell_type": "markdown", - "id": "e6d62337", + "id": "a7002b1e", "metadata": {}, "source": [ "# Finetuning HiFi-GAN\n", "\n", - "Improving speech quality by Finetuning HiFi-GAN on synthesized mel-spectrograms from FastPitch. 
" + "Improving speech quality by finetuning HiFi-GAN on synthesized mel-spectrograms from FastPitch. " ] }, { "cell_type": "markdown", - "id": "4efcf84d", + "id": "4eccd357", "metadata": {}, "source": [ "## Generating synthetic mels\n", @@ -820,19 +493,18 @@ { "cell_type": "code", "execution_count": null, - "id": "59d12874", + "id": "a5edb6fe", "metadata": {}, "outputs": [], "source": [ - "test_audio_filepath = \"DataGermanTTS/thorsten-de/wavs/5d000c81c8e7c4817cbfd7c4b8738feb.wav\"\n", - "test_audio_text = \"Dieser Geruch, wenn jemand eine Clementine \\u00f6ffnet!\"\n", - "fastpitch_model_path = \"\"" + "test_audio_filepath = \"data_thorsten_2210/ThorstenVoice-Dataset-22_10/ThorstenVoice-Dataset_2022.10/wavs/43ee28172fe1a9d1eebd77bc09f03e51.wav\"\n", + "test_audio_text = \"daran beteiligten sich einhundertachtzig kommunen und hochschulen.\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "9395abc3", + "id": "769f270a", "metadata": {}, "outputs": [], "source": [ @@ -865,7 +537,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1679a9f9", + "id": "aef1470b", "metadata": {}, "outputs": [], "source": [ @@ -874,7 +546,7 @@ }, { "cell_type": "markdown", - "id": "4a2cb665", + "id": "5a840dcf", "metadata": {}, "source": [ "So we have 2 types of mel spectrograms that we can use for finetuning HiFi-GAN:\n", @@ -885,14 +557,14 @@ { "cell_type": "code", "execution_count": null, - "id": "cb1ec7d4", + "id": "60d49e67", "metadata": {}, "outputs": [], "source": [ "print(\"loading original melspec\")\n", "y, sr = librosa.load(test_audio_filepath)\n", "# change n_fft, win_length, hop_length parameters below based on your specific config file\n", - "spectrogram2 = np.log(librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, win_length=1024, hop_length=256))\n", + "spectrogram2 = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, win_length=1024, hop_length=256)\n", "spectrogram = spectrogram2[ :80, :]\n", "print(\"spectrogram shape = \", spectrogram.shape)\n", "plot_logspec(spectrogram)\n", @@ -901,7 +573,7 @@ }, { "cell_type": "markdown", - "id": "a06ab269", + "id": "5641b7c4", "metadata": {}, "source": [ "### 2. Mel spectrogram predicted from FastPitch" @@ -910,7 +582,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2ba1586a", + "id": "5f448efd", "metadata": {}, "outputs": [], "source": [ @@ -929,10 +601,10 @@ }, { "cell_type": "markdown", - "id": "bf1639d4", + "id": "5c240c79", "metadata": {}, "source": [ - "Note: The above spectrogram has the duration 291 which is not equal to the ground truth length, i.e. 315. In order to finetune HiFi-GAN we need mel spectrogram predicted from FastPitch with groundtruth alignment and duration.\n", + "**Note**: The above predicted spectrogram has the duration of 241 frames which is not equal to the ground truth 345 frames. 
In order to finetune HiFi-GAN, we need the mel spectrogram predicted from FastPitch with ground truth alignment and duration.\n", "\n", "### 2.1 Mel spectrogram predicted from FastPitch with groundtruth alignment and duration " ] }, { "cell_type": "code", "execution_count": null, - "id": "f9fde5e7", + "id": "ddd0eef5", "metadata": {}, "outputs": [], "source": [ @@ -973,120 +645,33 @@ }, { "cell_type": "markdown", - "id": "f49b70b9", + "id": "a2c99521", "metadata": {}, "source": [ "In our experience, \n", "- Finetuning with #1 has artifacts from the original audio (noise) that get passed on as input to the vocoder resulting in artifacts in vocoder output in the form of noise.\n", "- On the other hand, #2.1 (i.e. `Mel spectrogram predicted from FastPitch with groundtruth alignment and duration`) gives the best results because it enables HiFi-GAN to learn mel spectrograms generated by FastPitch as well as duration distributions closer to the real world (i.e. ground truth) durations. \n", "\n", - "From an implementation perspective, we follow the same process described in [Finetuning FastPitch for a new speaker](FastPitch_Finetuning.ipynb) - i.e. take the latest checkpoint from FastPitch training and predict spectrograms for each of the input records in `train_manifest_text_normed.json`, `test_manifest_text_normed.json` and `val_manifest_text_normed.json`. NeMo provides an efficient script, [scripts/dataset_processing/tts/generate_mels.py](https://raw.githubusercontent.com/nvidia/NeMo/main/scripts/dataset_processing/tts/generate_mels.py), to generate Mel-spectrograms in the directory `NeMoGermanTTS/mels` and also create new JSON manifests with a suffix `_mel` by adding a new key `\"mel_filepath\"`. For example, `train_manifest_text_normed.json` corresponds to `train_manifest_text_normed_mel.json` saved in the same directory. You can run the following CLI to obtain the new JSON manifests." 
] }, { "cell_type": "code", "execution_count": null, - "id": "4d2c5e5d", + "id": "c1dc3025", "metadata": {}, "outputs": [], "source": [ - "import json\n", - "import numpy as np\n", - "import torch\n", - "import soundfile as sf\n", - "import librosa\n", - "\n", - "from pathlib import Path\n", - "\n", - "from nemo.collections.tts.torch.helpers import BetaBinomialInterpolator\n", - "\n", - "folder_name = \"synmels\"\n", - "fastpitch_model_path = \"\"\n", - "dataset_parts = [\"test_manifest_phonemes\", \"val_manifest_phonemes\", \"train_manifest_phonemes\"]\n", - "dataset_base_path = \"DataGermanTTS/\"\n", - "\n", - "from nemo.collections.tts.models import FastPitchModel\n", - "if \".nemo\" in fastpitch_model_path:\n", - " spec_model = FastPitchModel.restore_from(fastpitch_model_path).eval().cuda()\n", - "else:\n", - " spec_model = FastPitchModel.load_from_checkpoint(checkpoint_path=fastpitch_model_path).eval().cuda()\n", - "\n", - "spec_model.eval().cuda()\n", - " \n", - "def load_wav(audio_file):\n", - " with sf.SoundFile(audio_file, 'r') as f:\n", - " samples = f.read(dtype='float32')\n", - " return samples.transpose()\n", - " \n", - "for dataset_part in dataset_parts:\n", - " # Get records from the manifest\n", - " manifest_path = f\"{dataset_base_path}thorsten-de/{dataset_part}.json\"\n", - " records = []\n", - " with open(manifest_path, \"r\") as f:\n", - " for i, line in enumerate(f):\n", - " records.append(json.loads(line))\n", - "\n", - " beta_binomial_interpolator = BetaBinomialInterpolator()\n", - "\n", - " spec_model.eval()\n", - " device = spec_model.device\n", - "\n", - " save_dir = Path(f\"{dataset_base_path}{folder_name}/{dataset_part}\")\n", - "\n", - " save_dir.mkdir(exist_ok=True, parents=True)\n", - "\n", - " # Generate a spectrograms (we need to use ground truth alignment for correct matching between audio and mels)\n", - " for i, r in enumerate(records):\n", - " audio = load_wav(r[\"audio_filepath\"])\n", - "\n", - " audio = torch.from_numpy(audio).unsqueeze(0).to(device)\n", - " audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)\n", - "\n", - " # Again, our finetuned FastPitch model doesn't use multiple speakers,\n", - " # but we keep the code to support it here for reference\n", - " if spec_model.fastpitch.speaker_emb is not None and \"speaker\" in r:\n", - " speaker = torch.tensor([r['speaker']]).to(device)\n", - " else:\n", - " speaker = None\n", - "\n", - " with torch.no_grad():\n", - " if \"normalized_text\" in r:\n", - " text = spec_model.parse(r[\"normalized_text\"], normalize=False)\n", - " else:\n", - " text = spec_model.parse(r['text'])\n", - "\n", - " text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)\n", - "\n", - " spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)\n", - "\n", - " # Generate attention prior and spectrogram inputs for HiFi-GAN\n", - " attn_prior = torch.from_numpy(\n", - " beta_binomial_interpolator(spect_len.item(), text_len.item())\n", - " ).unsqueeze(0).to(text.device)\n", - "\n", - " spectrogram = spec_model.forward(\n", - " text=text, \n", - " input_lens=text_len, \n", - " spec=spect, \n", - " mel_lens=spect_len, \n", - " attn_prior=attn_prior,\n", - " speaker=speaker,\n", - " )[0]\n", - "\n", - " save_path = save_dir / f\"mel_{i}.npy\"\n", - " np.save(save_path, spectrogram[0].to('cpu').numpy())\n", - " r[\"mel_filepath\"] = str(save_path)\n", - "\n", - " hifigan_manifest_path = 
f\"{dataset_base_path}{folder_name}/hifigan_{dataset_part}_ft.json\"\n", - "\n", - " with open(hifigan_manifest_path, \"w\") as f:\n", - " for r in records:\n", - " f.write(json.dumps(r) + '\\n')" + "!cd NeMoGermanTTS && python generate_mels.py \\\n", + " --cpu \\\n", + " --fastpitch-model-ckpt {fastpitch_model_path.split(\"/\", maxsplit=1)[1]} \\\n", + " --input-json-manifests train_manifest_text_normed.json val_manifest_text_normed.json test_manifest_text_normed.json \\\n", + " --output-json-manifest-root ./" ] }, { "cell_type": "markdown", - "id": "371506df", + "id": "d4cb8e4a", "metadata": {}, "source": [ "Revisiting how we implement #2.1 (i.e. Predicted mel spectrogram predicted from FastPitch with groundtruth alignment and duration):\n", @@ -1113,98 +698,57 @@ " )[0]\n", " ```\n", " \n", - "Repeat the above script for train and validation datasets as well. \n", - "\n", - "Finally, the `DataGermanTTS/synmels` will look like:\n", - "```\n", - "DataGermanTTS/synmels/:\n", - "hifigan_test_manifest_phonemes_ft.json\n", - "hifigan_train_manifest_phonemes_ft.json\n", - "hifigan_val_manifest_phonemes_ft.json\n", - "test_manifest_phonemes\n", - "train_manifest_phonemes\n", - "val_manifest_phonemes\n", - "\n", - "DataGermanTTS/synmels/test_manifest_phonemes:\n", - "mel_0.npy\n", - "mel_1.npy\n", - "...\n", - "\n", - "DataGermanTTS/synmels/train_manifest_phonemes:\n", - "mel_0.npy\n", - "mel_1.npy\n", - "...\n", - "\n", - "DataGermanTTS/synmels/val_manifest_phonemes:\n", - "mel_0.npy\n", - "mel_1.npy\n", - "...\n", - "```\n", - "\n", - "Example HiFi-GAN manifest:\n", - "```json\n", - "{\"audio_filepath\": \"DataGermanTTS/thorsten-de/wavs/e50eb02c25353f85549900d2fc1e0e32.wav\", \"duration\": 2.409977, \"text\": \"Geht die Schandtat auf sein Konto?\", \"normalized_text\": \"Geht die Schandtat auf sein Konto?\", \"mel_filepath\": \"DataGermanTTS/synmels/test_manifest_phonemes/mel_0.npy\"}\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "565e07a8", - "metadata": {}, - "source": [ - "## Launch finetuning\n", - "\n", - "Download the pre-trained HiFi-GAN from NGC." + "Repeat the above script for train and validation datasets as well. " ] }, { "cell_type": "code", "execution_count": null, - "id": "fa06c0b1", + "id": "9b0918cc", "metadata": {}, "outputs": [], "source": [ - "!(cd DataGermanTTS && \\\n", - " wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_hifigan/versions/1.0.0rc1/zip -O tts_hifigan_1.0.0rc1.zip && \\\n", - " unzip tts_hifigan_1.0.0rc1.zip)" + "# Example HiFi-GAN manifest:\n", + "!head -n1 NeMoGermanTTS/train_manifest_text_normed_mel.json | jq" ] }, { "cell_type": "markdown", - "id": "fc7e8554", + "id": "61cf4696", "metadata": {}, "source": [ + "## Launch finetuning\n", + "\n", "We will be re-using the existing HiFi-GAN config and HiFi-GAN pretrained on English." ] }, { "cell_type": "code", "execution_count": null, - "id": "73b4cbc2", + "id": "712df66f", "metadata": {}, "outputs": [], "source": [ - "!(python NeMoGermanTTS/hifigan_finetune.py --config-path . --config-name hifigan.yaml \\\n", + "!cd NeMoGermanTTS && CUDA_VISIBLE_DEVICES=0 python hifigan_finetune.py --config-path . 
--config-name hifigan.yaml \\\n", " model.max_steps=10 \\\n", " model.optim.lr=0.00001 \\\n", " ~model.optim.sched \\\n", - " train_dataset=DataGermanTTS/synmels/hifigan_train_manifest_phonemes_ft.json \\\n", - " validation_datasets=DataGermanTTS/synmels/hifigan_val_manifest_phonemes_ft.json \\\n", + " train_dataset=train_manifest_text_normed_mel.json \\\n", + " validation_datasets=val_manifest_text_normed_mel.json \\\n", " exp_manager.exp_dir=resultGermanTTS \\\n", - " +init_from_nemo_model=DataGermanTTS/tts_hifigan.nemo \\\n", - " trainer.devices=-1 \\\n", + " +init_from_pretrained_model={hfg_ngc} \\\n", " +trainer.val_check_interval=5 \\\n", " trainer.check_val_every_n_epoch=null \\\n", " model/train_ds=train_ds_finetune \\\n", " model/validation_ds=val_ds_finetune \\\n", " exp_manager.create_wandb_logger=true \\\n", " exp_manager.wandb_logger_kwargs.name=\"tutorial_2\" \\\n", - " exp_manager.wandb_logger_kwargs.project=\"GermanTTS\")" + " exp_manager.wandb_logger_kwargs.project=\"GermanTTS\"" ] }, { "cell_type": "markdown", - "id": "c8721ea1", + "id": "f1e18615", "metadata": {}, "source": [ "Note: We've limited the above run to 10 steps only, so we can validate the implementation within the scope of this tutorial. We recommend evaluating HiFi-GAN around every 50 steps until you get the desired quality results." ] }, { "cell_type": "markdown", - "id": "a3b04d69", + "id": "bb4a8c7b", "metadata": {}, "source": [ "## Evaluating FastPitch and Finetuned HiFi-GAN\n", @@ -1223,12 +767,11 @@ { "cell_type": "code", "execution_count": null, - "id": "a62d54cf", + "id": "bd683d45", "metadata": {}, "outputs": [], "source": [ - "hfg_path = \"\"\n", - "fastpitch_model_path = \"\"\n", + "hfg_path = sorted(glob.glob(\"NeMoGermanTTS/resultGermanTTS/HifiGan/*/checkpoints/HifiGan.nemo\"), key=os.path.getmtime)[-1]\n", "\n", "if \".nemo\" in hfg_path:\n", " vocoder_model_pt = HifiGanModel.restore_from(hfg_path).eval().cuda()\n", @@ -1244,7 +787,7 @@ { "cell_type": "code", "execution_count": null, - "id": "94c2b645", + "id": "0a7b7aa2", "metadata": {}, "outputs": [], "source": [ @@ -1266,7 +809,7 @@ }, { "cell_type": "markdown", - "id": "dbe10199", + "id": "0700e6f4", "metadata": {}, "source": [ "That's it!" ] } ], "metadata": { ... "language_info": { ... "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.10" } }, "nbformat": 4,
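Beyond playing audio inline, it can be handy to write the synthesized waveform to disk, e.g. to compare checkpoints later. A sketch reusing the `spec_gen_model` and finetuned `vocoder_model_pt` loaded in the evaluation cells above (the output filename is arbitrary; 22050 Hz is the sample rate used throughout this tutorial):

```python
# Sketch: synthesize one sentence with FastPitch + finetuned HiFi-GAN and save it.
import soundfile as sf
import torch

text = "das werden einmal über eine million kleiner buchenpflanzen werden."
with torch.no_grad():
    tokens = spec_gen_model.parse(text, normalize=True)        # German TN + tokenization
    spec = spec_gen_model.generate_spectrogram(tokens=tokens)  # FastPitch mel prediction
    audio = vocoder_model_pt.convert_spectrogram_to_audio(spec=spec)  # HiFi-GAN vocoding

sf.write("german_tts_sample.wav", audio.squeeze().cpu().numpy(), samplerate=22050)
```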