From 26be9f276f918b7a2b8800f0e6fc783ce7baa038 Mon Sep 17 00:00:00 2001 From: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Date: Wed, 1 Mar 2023 13:29:21 -0800 Subject: [PATCH] [TTS]update German NGC models trained on Thorsten Datasets (#6125) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> --- docs/source/tts/data/datasets.csv | 3 ++- docs/source/tts/data/ngc_models_am.csv | 5 +++-- docs/source/tts/data/ngc_models_vocoder.csv | 3 ++- nemo/collections/tts/models/fastpitch.py | 17 +++++++++++---- nemo/collections/tts/models/hifigan.py | 23 +++++++++++++++------ 5 files changed, 37 insertions(+), 14 deletions(-) diff --git a/docs/source/tts/data/datasets.csv b/docs/source/tts/data/datasets.csv index 6d25b6ea1500..e39b10500931 100644 --- a/docs/source/tts/data/datasets.csv +++ b/docs/source/tts/data/datasets.csv @@ -2,7 +2,8 @@ Language,Locale,Dataset Name,#spk-total,#spk-F,#spk-M,#hours-total,#hour-F,#hour English,en-US,LJSpeech,1,1,0,23.92,23.92,0.00,"22,050Hz",https://keithito.com/LJ-Speech-Dataset/ English,en-US,LibriTTS (clean),1230,592,638,262.62,133.97,128.65,"24,000Hz",https://www.openslr.org/60/ English,en-US,HiFiTTS,10,6,4,291.60,158.30,133.30,"44,100Hz",http://www.openslr.org/109/ -German,de-DE,Thorsten Müller (German Neutral-TTS dataset),1,0,1,22.96,0.00,22.96,"22,050Hz",https://www.openslr.org/95/ +German,de-DE,Thorsten Müller Neutral 21.02 dataset,1,0,1,20.91,0.00,20.91,"22,050Hz",https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1 +German,de-DE,Thorsten Müller Neutral 22.10 dataset,1,0,1,11.21,0.00,11.21,"22,050Hz",https://zenodo.org/record/7265581/files/ThorstenVoice-Dataset_2022.10.zip?download=1 German,de-DE,HUI-Audio-Corpus-German (clean),118,n/a,n/a,253.85,0.00,0.00,"44,100Hz",https://opendata.iisys.de/datasets.html Spanish,es-AR,Crowdsourced high-quality Argentinian Spanish,44,31,13,8.03,5.61,2.42,"48,000Hz",https://www.openslr.org/61/ Spanish,es-CL,Crowdsourced high-quality Chilean Spanish,31,13,18,7.15,2.84,4.31,"48,000Hz",https://www.openslr.org/71/ diff --git a/docs/source/tts/data/ngc_models_am.csv b/docs/source/tts/data/ngc_models_am.csv index 5170db277579..6377ad617a63 100644 --- a/docs/source/tts/data/ngc_models_am.csv +++ b/docs/source/tts/data/ngc_models_am.csv @@ -1,4 +1,4 @@ -Locale,Model Name,Dataset,Sampling Rate,#Spk,Phoneme Unit,Model Class,Overview,Checkpoint +Locale,Model Name,Dataset,Sampling Rate,#Spk,Symbols,Model Class,Overview,Checkpoint en-US,tts_en_fastpitch,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_en_fastpitch `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/1.8.1/files/tts_en_fastpitch_align.nemo`` en-US,tts_en_fastpitch_ipa,LJSpeech,22050Hz,1,IPA,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_en_fastpitch `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_fastpitch/versions/IPA_1.13.0/files/tts_en_fastpitch_align_ipa.nemo`` en-US,tts_en_fastpitch_multispeaker,HiFiTTS,44100Hz,10,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_en_multispeaker_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_multispeaker_fastpitchhifigan/versions/1.10.0/files/tts_en_fastpitch_multispeaker.nemo`` @@ -7,6 +7,7 @@ en-US,tts_en_lj_mixerttsx,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models en-US,RAD-TTS,TBD,TBD,TBD,ARPABET,nemo.collections.tts.models.radtts.RadTTSModel,TBD, en-US,tts_en_tacotron2,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.tacotron2.Tacotron2Model,`tts_en_tacotron2 `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_tacotron2/versions/1.10.0/files/tts_en_tacotron2.nemo`` de-DE,tts_de_fastpitch_multispeaker_5,HUI Audio Corpus German,44100Hz,5,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitch_multispeaker_5 `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_fastpitch_multispeaker_5.nemo`` -de-DE,tts_de_fastpitch_singlespeaker,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_fastpitch_align.nemo`` +de-DE,tts_de_fastpitch_singleSpeaker_thorstenNeutral_2102,Thorsten Müller Neutral 21.02 dataset,22050Hz,1,Graphemes,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_fastpitch_thorstens2102.nemo`` +de-DE,tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210,Thorsten Müller Neutral 22.10 dataset,22050Hz,1,Graphemes,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_fastpitch_thorstens2210.nemo`` es,tts_es_fastpitch_multispeaker,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,IPA,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_es_multispeaker_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.15.0/files/tts_es_fastpitch_multispeaker.nemo`` zh-CN,tts_zh_fastpitch_sfspeech,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,pinyin,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_zh_fastpitch_hifigan_sfspeech `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.15.0/files/tts_zh_fastpitch_sfspeech.nemo`` diff --git a/docs/source/tts/data/ngc_models_vocoder.csv b/docs/source/tts/data/ngc_models_vocoder.csv index 420c7319010f..01c885ce796b 100644 --- a/docs/source/tts/data/ngc_models_vocoder.csv +++ b/docs/source/tts/data/ngc_models_vocoder.csv @@ -8,6 +8,7 @@ en-US,tts_en_libritts_univnet,librosa.filters.mel,LibriTTS,24000Hz,1,nemo.collec en-US,tts_waveglow_88m,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.tts.models.waveglow.WaveGlowModel,`tts_waveglow_88m `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_waveglow_88m/versions/1.0.0/files/tts_waveglow.nemo`` en-US,tts_waveglow_268m,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.tts.models.waveglow.WaveGlowModel,`tts_waveglow_268m `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_waveglow_268m/versions/1.0.0rc1/files/tts_waveglow_268m.nemo`` de-DE,tts_de_hui_hifigan_ft_fastpitch_multispeaker_5,FastPitch,HUI Audio Corpus German,44100Hz,5,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitch_multispeaker_5 `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_hui_hifigan_ft_fastpitch_multispeaker_5.nemo`` -de-DE,tts_de_slr_hifigan_ft_fastpitch_singlespeaker,FastPitch,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_hifigan.nemo`` +de-DE,tts_de_hifigan_singleSpeaker_thorstenNeutral_2102,FastPitch,Thorsten Müller Neutral 21.02 dataset,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_hifigan_thorstens2102.nemo`` +de-DE,tts_de_hifigan_singleSpeaker_thorstenNeutral_2210,FastPitch,Thorsten Müller Neutral 22.10 dataset,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_hifigan_thorstens2210.nemo`` es,tts_es_hifigan_ft_fastpitch_multispeaker,FastPitch,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_es_multispeaker_fastpitchhifigan `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.15.0/files/tts_es_hifigan_ft_fastpitch_multispeaker.nemo`` zh-CN,tts_zh_hifigan_sfspeech,FastPitch,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_zh_fastpitch_hifigan_sfspeech `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.15.0/files/tts_zh_hifigan_sfspeech.nemo`` diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py index e488ff5e36e1..85627b903498 100644 --- a/nemo/collections/tts/models/fastpitch.py +++ b/nemo/collections/tts/models/fastpitch.py @@ -591,11 +591,20 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]': ) list_of_models.append(model) - # de-DE, single speaker, 22050 Hz, OpenSLR Neutral German Dataset. + # de-DE, single male speaker, grapheme-based tokenizer, 22050 Hz, Thorsten Müller’s German Neutral-TTS Dataset, 21.02 model = PretrainedModelInfo( - pretrained_model_name="tts_de_fastpitch_singlespeaker", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_fastpitch_align.nemo", - description="This model is trained on a single male speaker data in OpenSLR Neutral German Dataset sampled at 22050Hz and can be used to generate male German voices.", + pretrained_model_name="tts_de_fastpitch_singleSpeaker_thorstenNeutral_2102", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_fastpitch_thorstens2102.nemo", + description="This model is trained on a single male speaker data in Thorsten Müller’s German Neutral 21.02 Dataset sampled at 22050Hz and can be used to generate male German voices.", + class_=cls, + ) + list_of_models.append(model) + + # de-DE, single male speaker, grapheme-based tokenizer, 22050 Hz, Thorsten Müller’s German Neutral-TTS Dataset, 22.10 + model = PretrainedModelInfo( + pretrained_model_name="tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_fastpitch_thorstens2210.nemo", + description="This model is trained on a single male speaker data in Thorsten Müller’s German Neutral 22.10 Dataset sampled at 22050Hz and can be used to generate male German voices.", class_=cls, ) list_of_models.append(model) diff --git a/nemo/collections/tts/models/hifigan.py b/nemo/collections/tts/models/hifigan.py index b07896a03b1b..78654b9b4c30 100644 --- a/nemo/collections/tts/models/hifigan.py +++ b/nemo/collections/tts/models/hifigan.py @@ -381,13 +381,24 @@ def list_available_models(cls) -> 'Optional[Dict[str, str]]': ) list_of_models.append(model) - # de-DE, single speaker, 22050 Hz, OpenSLR Neutral German Dataset. + # de-DE, single male speaker, 22050 Hz, Thorsten Müller’s German Neutral-TTS Dataset, 21.02 model = PretrainedModelInfo( - pretrained_model_name="tts_de_slr_hifigan_ft_fastpitch_singlespeaker", - location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_hifigan.nemo", - description="This model is finetuned from the HiFiGAN pretrained checkpoint `tts_hifigan` " - "by the mel-spectrograms generated from the FastPitch checkpoint `tts_de_fastpitch_singlespeaker`. This model " - "has been tested on generating male German voices.", + pretrained_model_name="tts_de_hifigan_singleSpeaker_thorstenNeutral_2102", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_hifigan_thorstens2102.nemo", + description="This model is finetuned from the HiFiGAN pretrained checkpoint `tts_en_lj_hifigan_ft_mixerttsx`" + " by the mel-spectrograms generated from the FastPitch checkpoint `tts_de_fastpitch_singleSpeaker_thorstenNeutral_2102`." + " This model has been tested on generating male German neutral voices.", + class_=cls, + ) + list_of_models.append(model) + + # de-DE, single male speaker, 22050 Hz, Thorsten Müller’s German Neutral-TTS Dataset, 22.10 + model = PretrainedModelInfo( + pretrained_model_name="tts_de_hifigan_singleSpeaker_thorstenNeutral_2210", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.15.0/files/tts_de_hifigan_thorstens2210.nemo", + description="This model is finetuned from the HiFiGAN pretrained checkpoint `tts_en_lj_hifigan_ft_mixerttsx`" + " by the mel-spectrograms generated from the FastPitch checkpoint `tts_de_fastpitch_singleSpeaker_thorstenNeutral_2210`." + " This model has been tested on generating male German neutral voices.", class_=cls, ) list_of_models.append(model)