diff --git a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb
deleted file mode 100644
index 40b959d680..0000000000
--- a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb
+++ /dev/null
@@ -1,401 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "6LWsNd3_M3MP"
- },
- "source": [
- "# Mozilla TTS on CPU Real-Time Speech Synthesis "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "FAqrSIWgLyP0"
- },
- "source": [
- "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
- "\n",
- "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
- "\n",
- "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
- "\n",
- "Note that both model performances can be improved with more training."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Ku-dA4DKoeXk"
- },
- "source": [
- "### Download Models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 162
- },
- "colab_type": "code",
- "id": "jGIgnWhGsxU1",
- "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe",
- "tags": []
- },
- "outputs": [],
- "source": [
- "! mkdir data/\n",
- "! gdown --id 1SYpv7V__QYDjKXa_vJmNXo1CSkcoZovy -O data/tts_model.pth.tar\n",
- "! gdown --id 14BIvfJXnFHi3jcxYNX40__TR6RwJOZqi -O data/tts_config.json\n",
- "! gdown --id 1ECRlXybT6rAWp269CkhjUPwcZ10CkcqD -O data/tts_scale_stats.npy"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 235
- },
- "colab_type": "code",
- "id": "4dnpE0-kvTsu",
- "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e",
- "tags": []
- },
- "outputs": [],
- "source": [
- "! gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n",
- "! gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/vocoder_config.json\n",
- "! gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/vocoder_scale_stats.npy"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Zlgi8fPdpRF0"
- },
- "source": [
- "### Define TTS function"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "f-Yc42nQZG5A"
- },
- "outputs": [],
- "source": [
- "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, style_wav=None):\n",
- " t_1 = time.time()\n",
- " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=style_wav,\n",
- " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
- " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
- " if not use_gl:\n",
- " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
- " waveform = waveform.flatten()\n",
- " if use_cuda:\n",
- " waveform = waveform.cpu()\n",
- " waveform = waveform.numpy()\n",
- " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
- " tps = (time.time() - t_1) / len(waveform)\n",
- " print(waveform.shape)\n",
- " print(\" > Run-time: {}\".format(time.time() - t_1))\n",
- " print(\" > Real-time factor: {}\".format(rtf))\n",
- " print(\" > Time per step: {}\".format(tps))\n",
- " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
- " return alignment, mel_postnet_spec, stop_tokens, waveform"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "ZksegYQepkFg"
- },
- "source": [
- "### Load Models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "oVa0kOamprgj"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "import torch\n",
- "import time\n",
- "import IPython\n",
- "\n",
- "from TTS.tts.utils.generic_utils import setup_model\n",
- "from TTS.utils.io import load_config\n",
- "from TTS.tts.utils.text.symbols import symbols, phonemes, make_symbols\n",
- "from TTS.utils.audio import AudioProcessor\n",
- "from TTS.tts.utils.synthesis import synthesis"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "EY-sHVO8IFSH"
- },
- "outputs": [],
- "source": [
- "# runtime settings\n",
- "use_cuda = False"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "_1aIUp2FpxOQ"
- },
- "outputs": [],
- "source": [
- "# model paths\n",
- "TTS_MODEL = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/checkpoint_17000.pth.tar\"\n",
- "TTS_CONFIG = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/config.json\"\n",
- "\n",
- "TTS_MODEL = \"data/tts_model.pth.tar\"\n",
- "TTS_CONFIG = \"data/tts_config.json\"\n",
- "\n",
- "VOCODER_MODEL = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/model_file.pth.tar\"\n",
- "VOCODER_CONFIG = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/config.json\"\n",
- "\n",
- "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n",
- "VOCODER_CONFIG = \"data/vocoder_config.json\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {},
- "colab_type": "code",
- "id": "CpgmdBVQplbv"
- },
- "outputs": [],
- "source": [
- "# load configs\n",
- "TTS_CONFIG = load_config(TTS_CONFIG)\n",
- "VOCODER_CONFIG = load_config(VOCODER_CONFIG)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 471
- },
- "colab_type": "code",
- "id": "zmrQxiozIUVE",
- "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49",
- "tags": []
- },
- "outputs": [],
- "source": [
- "# load the audio processor\n",
- "TTS_CONFIG.audio['stats_path'] = 'data/tts_scale_stats.npy'\n",
- "ap = AudioProcessor(**TTS_CONFIG.audio) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 35
- },
- "colab_type": "code",
- "id": "8fLoI4ipqMeS",
- "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382",
- "tags": []
- },
- "outputs": [],
- "source": [
- "# LOAD TTS MODEL\n",
- "# multi speaker \n",
- "speaker_id = None\n",
- "speakers = []\n",
- "\n",
- "# load the model (chinese_mandarin special characters/punctuations are in the tts_config.json)\n",
- "if TTS_CONFIG.get(\"characters\"):\n",
- " _characters = TTS_CONFIG[\"characters\"][\"characters\"]\n",
- " _phonemes = TTS_CONFIG[\"characters\"][\"phonemes\"]\n",
- " _punctuations = TTS_CONFIG[\"characters\"][\"punctuations\"]\n",
- " _pad = TTS_CONFIG[\"characters\"][\"pad\"]\n",
- " _eos = TTS_CONFIG[\"characters\"][\"eos\"]\n",
- " _bos = TTS_CONFIG[\"characters\"][\"bos\"]\n",
- " \n",
- " symbols, phonemes = make_symbols(_characters, _phonemes, punctuations= _punctuations, pad=_pad, eos=_eos, bos=_bos )\n",
- "\n",
- "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
- "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
- "\n",
- "# load model state\n",
- "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n",
- "\n",
- "# load the model\n",
- "model.load_state_dict(cp['model'])\n",
- "if use_cuda:\n",
- " model.cuda()\n",
- "model.eval()\n",
- "\n",
- "# set model stepsize\n",
- "if 'r' in cp:\n",
- " model.decoder.set_r(cp['r'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 1000
- },
- "colab_type": "code",
- "id": "zKoq0GgzqzhQ",
- "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb",
- "tags": []
- },
- "outputs": [],
- "source": [
- "from TTS.vocoder.utils.generic_utils import setup_generator\n",
- "\n",
- "# LOAD VOCODER MODEL\n",
- "vocoder_model = setup_generator(VOCODER_CONFIG)\n",
- "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
- "vocoder_model.remove_weight_norm()\n",
- "vocoder_model.inference_padding = 0\n",
- "\n",
- "\n",
- "VOCODER_CONFIG.audio['stats_path'] = 'data/vocoder_scale_stats.npy'\n",
- "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
- "if use_cuda:\n",
- " vocoder_model.cuda()\n",
- "vocoder_model.eval()\n",
- "print(\"\\nVocoder loaded\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "Ws_YkPKsLgo-"
- },
- "source": [
- "## Run Inference"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Here some test sentences for you to play with :\n",
- "sentence = \"我从来不会说很标准的中文。\"\n",
- "sentence = \"我喜欢听人工智能的博客。\"\n",
- "sentence = \"我来自一个法国郊区的地方。\"\n",
- "sentence = \"不比不知道,一比吓一跳!\"\n",
- "sentence = \"台湾是一个真的很好玩的地方!\"\n",
- "sentence = \"干一行,行一行,行行都行。\"\n",
- "sentence = \"我要盖被子,好尴尬!\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# You can also play with the style_wav global style token. However, the lady speaking in the baker dataset\n",
- "# has no emotion through all the sentences. It's hard to get some nice GST with this.\n",
- "# That's also why adding \"!\" or \"?\" at the end of sentence change nothing. The dataset has no such prosody.\n",
- "style_wav = {\"2\": 0.3, \"1\": -0.1}\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 134
- },
- "colab_type": "code",
- "id": "FuWxZ9Ey5Puj",
- "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91",
- "tags": []
- },
- "outputs": [],
- "source": [
- "sentence = \"我喜欢听人工智能的博客。\"\n",
- "style_wav = {\"2\": 0.2, \"7\": -0.1}\n",
- "\n",
- "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True, style_wav= style_wav)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "accelerator": "GPU",
- "colab": {
- "collapsed_sections": [],
- "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
- "provenance": [],
- "toc_visible": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb
new file mode 100644
index 0000000000..1be93a82d2
--- /dev/null
+++ b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb
@@ -0,0 +1,606 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "6LWsNd3_M3MP"
+ },
+ "source": [
+ "# Mozilla TTS on CPU Real-Time Speech Synthesis "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "FAqrSIWgLyP0"
+ },
+ "source": [
+ "We use Tacotron2 and MultiBand-Melgan models and Baker dataset (chinese mandarin).\n",
+ "\n",
+ "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 126K steps (3 days) with a single GPU.\n",
+ "\n",
+ "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
+ "\n",
+ "Note that both model performances can be improved with more training."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "Ku-dA4DKoeXk"
+ },
+ "source": [
+ "### Download Models"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "Zlgi8fPdpRF0"
+ },
+ "source": [
+ "### Define TTS function"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "ZksegYQepkFg"
+ },
+ "source": [
+ "### Load Models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "oVa0kOamprgj"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "import IPython\n",
+ "\n",
+ "from TTS.utils.synthesizer import Synthesizer\n",
+ "from TTS.utils.manage import ModelManager\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "EY-sHVO8IFSH"
+ },
+ "outputs": [],
+ "source": [
+ "# runtime settings\n",
+ "use_cuda = False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# tts and vocoder name\n",
+ "TTS_NAME = \"tts_models/zh-CN/baker/tacotron2-DDC-GST\"\n",
+ "VOCODER_NAME = \"vocoder_models/en/ljspeech/multiband-melgan\"\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "manager = ModelManager(\"../TTS/.models.json\")"
+ ]
+ },
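+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# (optional sketch) print every model id the manager knows about, to pick other\n",
+ "# TTS/vocoder combinations. list_models() is the call behind the `tts --list_models`\n",
+ "# CLI; if your TTS version does not expose it, simply skip this cell.\n",
+ "manager.list_models()"
+ ]
+ },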
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > tts_models/zh-CN/baker/tacotron2-DDC-GST is already downloaded.\n",
+ " > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.\n"
+ ]
+ }
+ ],
+ "source": [
+ "tts_checkpoint_file, tts_config_file, tts_json_dict = manager.download_model(TTS_NAME)\n",
+ "vocoder_checkpoint_file, vocoder_config_file, vocoder_json_dict = manager.download_model(VOCODER_NAME)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Using model: tacotron2\n",
+ " > Generator Model: multiband_melgan_generator\n"
+ ]
+ }
+ ],
+ "source": [
+ "synthesizer = Synthesizer(tts_checkpoint_file, tts_config_file, vocoder_checkpoint_file, vocoder_config_file, use_cuda)\n",
+ "sample_rate = synthesizer.tts_config.audio[\"sample_rate\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "Ws_YkPKsLgo-"
+ },
+ "source": [
+ "## Run Inference"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Here some test sentences for you to play with :\n",
+ "sentences= [\"我从来不会说很标准的中文。\",\n",
+ "\"我喜欢听人工智能的博客。\",\n",
+ "\"我来自一个法国郊区的地方。\",\n",
+ "\"不比不知道,一比吓一跳!\",\n",
+ "\"台湾是一个真的很好玩的地方!\",\n",
+ "\"干一行,行一行,行行都行。\",\n",
+ "\"我要盖被子,好尴尬!\",]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['我从来不会说很标准的中文。']\n",
+ " > Processing time: 1.6665124893188477\n",
+ " > Real-time factor: 0.5583910829911347\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['我喜欢听人工智能的博客。']\n",
+ " > Processing time: 1.4052538871765137\n",
+ " > Real-time factor: 0.5193391025114328\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['我来自一个法国郊区的地方。']\n",
+ " > Processing time: 1.605910062789917\n",
+ " > Real-time factor: 0.5785999490934259\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['不比不知道,一比吓一跳!']\n",
+ " > Processing time: 1.9105627536773682\n",
+ " > Real-time factor: 0.6607262973429417\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['台湾是一个真的很好玩的地方!']\n",
+ " > Processing time: 1.3081049919128418\n",
+ " > Real-time factor: 0.4218891158389621\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['干一行,行一行,行行都行。']\n",
+ " > Processing time: 2.0958540439605713\n",
+ " > Real-time factor: 0.6709288860239634\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['我要盖被子,好尴尬!']\n",
+ " > Processing time: 1.5188167095184326\n",
+ " > Real-time factor: 0.6257456734843319\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "for sentence in sentences:\n",
+ " wav = synthesizer.tts(sentence)\n",
+ " IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) \n",
+ " "
+ ]
+ },
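+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# (optional sketch) write one synthesized sentence to disk so it can be reused outside\n",
+ "# the notebook. scipy is already pulled in by the TTS requirements; the output file\n",
+ "# name below is an arbitrary choice.\n",
+ "import numpy as np\n",
+ "from scipy.io import wavfile\n",
+ "\n",
+ "wav = synthesizer.tts(sentences[0])\n",
+ "wavfile.write(\"baker_sample.wav\", sample_rate, np.asarray(wav, dtype=np.float32))"
+ ]
+ },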
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['我喜欢听人工智能的博客。']\n",
+ " > Processing time: 2.114016056060791\n",
+ " > Real-time factor: 0.643271887228699\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# you can also play with Global Style Token (GST) by feeding a \n",
+ "# ... wav_style parameter to the tts method\n",
+ "\n",
+ "style_wav = {\"2\": 0.2}\n",
+ "\n",
+ "wav = synthesizer.tts(sentences[1], style_wav=style_wav)\n",
+ "IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['我喜欢听人工智能的博客。']\n",
+ " > Processing time: 1.5687272548675537\n",
+ " > Real-time factor: 0.6401842606201799\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['我喜欢听人工智能的博客。']\n",
+ " > Processing time: 2.070594072341919\n",
+ " > Real-time factor: 0.8067677285683367\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['我喜欢听人工智能的博客。']\n",
+ " > Processing time: 1.3769311904907227\n",
+ " > Real-time factor: 0.5088718951180015\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['我喜欢听人工智能的博客。']\n",
+ " > Processing time: 2.024374485015869\n",
+ " > Real-time factor: 0.6782983435843654\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "['我喜欢听人工智能的博客。']\n",
+ " > Processing time: 2.4434399604797363\n",
+ " > Real-time factor: 0.7435119663360867\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# On this model specifically, we can observe that the GSToken \"2\" is responsible for speech speed\n",
+ "# You can listen to these 5 different samples, the flow is slower and slower as the value is higher\n",
+ "for value in [-0.2, -0.1, 0, 0.1, 0.2]:\n",
+ " style_wav = {\"2\": value}\n",
+ " wav = synthesizer.tts(sentences[1], style_wav=style_wav)\n",
+ " IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) "
+ ]
+ },
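+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# (sketch) style_wav can also carry several token/weight pairs at once, so different\n",
+ "# style dimensions can be mixed in a single call; the token ids and weights below\n",
+ "# are arbitrary choices.\n",
+ "style_wav = {\"2\": 0.2, \"7\": -0.1}\n",
+ "wav = synthesizer.tts(sentences[1], style_wav=style_wav)\n",
+ "IPython.display.display(IPython.display.Audio(wav, rate=sample_rate))"
+ ]
+ },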
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "collapsed_sections": [],
+ "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
+ "provenance": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}