diff --git a/README.rst b/README.rst index 86f4b5a9a32f..1a082648fcfb 100644 --- a/README.rst +++ b/README.rst @@ -98,7 +98,7 @@ Key Features * `Speech synthesis (TTS) `_ * Spectrogram generation: Tacotron2, GlowTTS, TalkNet, FastPitch, FastSpeech2, Mixer-TTS, Mixer-TTS-X * Vocoders: WaveGlow, SqueezeWave, UniGlow, MelGAN, HiFiGAN, UnivNet - * End-to-end speech generation: FastPitch_HifiGan_E2E, FastSpeech2_HifiGan_E2E + * End-to-end speech generation: FastPitch_HifiGan_E2E, FastSpeech2_HifiGan_E2E, VITS * `NGC collection of pre-trained TTS models. `_ * `Tools `_ * `Text Processing (text normalization and inverse text normalization) `_ diff --git a/docs/source/tts/checkpoints.rst b/docs/source/tts/checkpoints.rst index 46ad0427c08e..f78cf6a2805e 100644 --- a/docs/source/tts/checkpoints.rst +++ b/docs/source/tts/checkpoints.rst @@ -144,4 +144,10 @@ Vocoders .. csv-table:: :file: data/ngc_models_vocoder.csv :align: left + :header-rows: 1 +End2End models +^^^^^^^^^^^^^^ +.. csv-table:: + :file: data/ngc_models_e2e.csv + :align: left :header-rows: 1 \ No newline at end of file diff --git a/docs/source/tts/data/ngc_models_e2e.csv b/docs/source/tts/data/ngc_models_e2e.csv new file mode 100644 index 000000000000..63d3a27dae0e --- /dev/null +++ b/docs/source/tts/data/ngc_models_e2e.csv @@ -0,0 +1,2 @@ +Locale,Model Name,Dataset,Sampling Rate,#Spk,Phoneme Unit,Model Class,Overview,Checkpoint +en-US,tts_en_lj_vits,LJSpeech,22050Hz,1,IPA,nemo.collections.tts.models.vits.VitsModel,`tts_en_lj_vits `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_vits/versions/1.13.0/files/vits_ljspeech_fp16_full.nemo`` \ No newline at end of file diff --git a/nemo/collections/tts/models/vits.py b/nemo/collections/tts/models/vits.py index d035c6a1b3ac..a18c762803e2 100644 --- a/nemo/collections/tts/models/vits.py +++ b/nemo/collections/tts/models/vits.py @@ -372,7 +372,14 @@ def setup_test_data(self, cfg): @classmethod def list_available_models(cls) -> 'List[PretrainedModelInfo]': 
list_of_models = [] - # TODO: List available models?? + model = PretrainedModelInfo( + pretrained_model_name="tts_en_lj_vits", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_en_lj_vits/versions/1.13.0/files/vits_ljspeech_fp16_full.nemo", + description="This model is trained on LJSpeech audio sampled at 22050Hz. This model has been tested on generating female English " + "voices with an American accent.", + class_=cls, + ) + list_of_models.append(model) return list_of_models @typecheck(