NVIDIA · XuesongYang · Feb 7, 2023 · Feb 6, 2023 · Feb 7, 2023
diff --git a/docs/source/tts/data/ngc_models_am.csv b/docs/source/tts/data/ngc_models_am.csv
@@ -9,4 +9,4 @@ en-US,tts_en_tacotron2,LJSpeech,22050Hz,1,ARPABET,nemo.collections.tts.models.ta
 de-DE,tts_de_fastpitch_multispeaker_5,HUI Audio Corpus German,44100Hz,5,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitch_multispeaker_5 <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitch_multispeaker_5>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_fastpitch_multispeaker_5.nemo``
 de-DE,tts_de_fastpitch_singlespeaker,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,ARPABET,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_fastpitch_align.nemo``
 es,tts_es_fastpitch_multispeaker,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,IPA,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.15.0/files/tts_es_fastpitch_multispeaker.nemo``
-zh-CN	,tts_zh_fastpitch_sfspeech,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,pinyin,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_zh_fastpitch_hifigan_sfspeech <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_zh_fastpitch_hifigan_sfspeech>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_fastpitch_sfspeech.nemo``
+zh-CN,tts_zh_fastpitch_sfspeech,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,pinyin,nemo.collections.tts.models.fastpitch.FastPitchModel,`tts_zh_fastpitch_hifigan_sfspeech <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_zh_fastpitch_hifigan_sfspeech>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.15.0/files/tts_zh_fastpitch_sfspeech.nemo``
diff --git a/docs/source/tts/data/ngc_models_vocoder.csv b/docs/source/tts/data/ngc_models_vocoder.csv
@@ -10,4 +10,4 @@ en-US,tts_waveglow_268m,librosa.filters.mel,LJSpeech,22050Hz,1,nemo.collections.
 de-DE,tts_de_hui_hifigan_ft_fastpitch_multispeaker_5,FastPitch,HUI Audio Corpus German,44100Hz,5,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitch_multispeaker_5 <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitch_multispeaker_5>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitch_multispeaker_5/versions/1.11.0/files/tts_de_hui_hifigan_ft_fastpitch_multispeaker_5.nemo``
 de-DE,tts_de_slr_hifigan_ft_fastpitch_singlespeaker,FastPitch,Thorsten Müller (German Neutral-TTS dataset),22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_de_fastpitchhifigan <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_de_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_de_fastpitchhifigan/versions/1.10.0/files/tts_de_hifigan.nemo``
 es,tts_es_hifigan_ft_fastpitch_multispeaker,FastPitch,OpenSLR crowdsourced Latin American Spanish,44100Hz,174,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_es_multispeaker_fastpitchhifigan <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/tts_es_multispeaker_fastpitchhifigan>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_es_multispeaker_fastpitchhifigan/versions/1.15.0/files/tts_es_hifigan_ft_fastpitch_multispeaker.nemo``
-zh-CN	,tts_zh_hifigan_sfspeech,FastPitch,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_zh_fastpitch_hifigan_sfspeech <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_zh_fastpitch_hifigan_sfspeech>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_hifigan_sfspeech.nemo``
+zh-CN,tts_zh_hifigan_sfspeech,FastPitch,SFSpeech Chinese/English Bilingual Speech,22050Hz,1,nemo.collections.tts.models.hifigan.HifiGanModel,`tts_zh_fastpitch_hifigan_sfspeech <https://ngc.nvidia.com/catalog/models/nvidia:nemo:tts_zh_fastpitch_hifigan_sfspeech>`_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.15.0/files/tts_zh_hifigan_sfspeech.nemo``
diff --git a/nemo/collections/tts/models/fastpitch.py b/nemo/collections/tts/models/fastpitch.py
@@ -612,11 +612,14 @@ def list_available_models(cls) -> 'List[PretrainedModelInfo]':
         )
         list_of_models.append(model)
 
-        # zh, single speaker, 22050Hz, SFSpeech Bilingual Chinese/English dataset
+        # zh, single female speaker, 22050Hz, SFSpeech Bilingual Chinese/English dataset; improved model using
+        # a richer dictionary and the jieba word segmenter for polyphone disambiguation.
         model = PretrainedModelInfo(
             pretrained_model_name="tts_zh_fastpitch_sfspeech",
-            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_fastpitch_sfspeech.nemo",
-            description="This model is trained on a single female speaker in SFSpeech Bilingual Chinese/English dataset sampled at 22050Hz and can be used to generate female Mandarin Chinese voices.",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.15.0/files/tts_zh_fastpitch_sfspeech.nemo",
+            description="This model is trained on a single female speaker in SFSpeech Bilingual Chinese/English dataset"
+            " sampled at 22050Hz and can be used to generate female Mandarin Chinese voices. It is improved"
+            " using richer dict and jieba word segmenter for polyphone disambiguation.",
             class_=cls,
         )
         list_of_models.append(model)

diff --git a/nemo/collections/tts/models/hifigan.py b/nemo/collections/tts/models/hifigan.py
@@ -415,16 +415,18 @@ def list_available_models(cls) -> 'Optional[Dict[str, str]]':
         )
         list_of_models.append(model)
 
-        # zh, single female speaker, 22050 Hz, SFSpeech Chinese/English Bilingual Dataset.
+        # zh, single female speaker, 22050Hz, SFSpeech Bilingual Chinese/English dataset; improved model using
+        # a richer dictionary and the jieba word segmenter for polyphone disambiguation.
         model = PretrainedModelInfo(
             pretrained_model_name="tts_zh_hifigan_sfspeech",
-            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.14.0/files/tts_zh_hifigan_sfspeech.nemo",
-            description="This model is finetuned from the HiFiGAN pretrained checkpoint `tts_en_hifitts_hifigan_ft_fastpitch` "
-            "by the mel-spectrograms generated from the FastPitch checkpoint `tts_zh_fastpitch_sfspeech`. This model "
-            "has been tested on generating female Mandarin Chinese voices.",
+            location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/tts_zh_fastpitch_hifigan_sfspeech/versions/1.15.0/files/tts_zh_hifigan_sfspeech.nemo",
+            description="This model is finetuned from the HiFiGAN pretrained checkpoint `tts_en_lj_hifigan_ft_mixerttsx`"
+            " by the mel-spectrograms generated from the FastPitch checkpoint `tts_zh_fastpitch_sfspeech`."
+            " This model has been tested on generating female Mandarin Chinese voices.",
             class_=cls,
         )
         list_of_models.append(model)
+
         return list_of_models
 
     def load_state_dict(self, state_dict, strict=True):