diff --git a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb deleted file mode 100644 index 40b959d680..0000000000 --- a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb +++ /dev/null @@ -1,401 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "6LWsNd3_M3MP" - }, - "source": [ - "# Mozilla TTS on CPU Real-Time Speech Synthesis " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "FAqrSIWgLyP0" - }, - "source": [ - "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n", - "\n", - "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n", - "\n", - "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", - "\n", - "Note that both model performances can be improved with more training." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ku-dA4DKoeXk" - }, - "source": [ - "### Download Models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 162 - }, - "colab_type": "code", - "id": "jGIgnWhGsxU1", - "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe", - "tags": [] - }, - "outputs": [], - "source": [ - "! mkdir data/\n", - "! gdown --id 1SYpv7V__QYDjKXa_vJmNXo1CSkcoZovy -O data/tts_model.pth.tar\n", - "! gdown --id 14BIvfJXnFHi3jcxYNX40__TR6RwJOZqi -O data/tts_config.json\n", - "! gdown --id 1ECRlXybT6rAWp269CkhjUPwcZ10CkcqD -O data/tts_scale_stats.npy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 235 - }, - "colab_type": "code", - "id": "4dnpE0-kvTsu", - "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e", - "tags": [] - }, - "outputs": [], - "source": [ - "! gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n", - "! gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/vocoder_config.json\n", - "! 
gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/vocoder_scale_stats.npy" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Zlgi8fPdpRF0" - }, - "source": [ - "### Define TTS function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "f-Yc42nQZG5A" - }, - "outputs": [], - "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, style_wav=None):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=style_wav,\n", - " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", - " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n", - " if not use_gl:\n", - " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", - " waveform = waveform.flatten()\n", - " if use_cuda:\n", - " waveform = waveform.cpu()\n", - " waveform = waveform.numpy()\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " tps = (time.time() - t_1) / len(waveform)\n", - " print(waveform.shape)\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " print(\" > Time per step: {}\".format(tps))\n", - " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ZksegYQepkFg" - }, - "source": [ - "### Load Models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "oVa0kOamprgj" - }, - "outputs": [], - "source": [ - "import os\n", - "import torch\n", - "import time\n", - "import IPython\n", - "\n", - "from TTS.tts.utils.generic_utils import setup_model\n", - "from TTS.utils.io import load_config\n", - "from TTS.tts.utils.text.symbols import symbols, phonemes, make_symbols\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.tts.utils.synthesis import synthesis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "EY-sHVO8IFSH" - }, - "outputs": [], - "source": [ - "# runtime settings\n", - "use_cuda = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_1aIUp2FpxOQ" - }, - "outputs": [], - "source": [ - "# model paths\n", - "TTS_MODEL = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/checkpoint_17000.pth.tar\"\n", - "TTS_CONFIG = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/config.json\"\n", - "\n", - "TTS_MODEL = \"data/tts_model.pth.tar\"\n", - "TTS_CONFIG = \"data/tts_config.json\"\n", - "\n", - "VOCODER_MODEL = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/model_file.pth.tar\"\n", - "VOCODER_CONFIG = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/config.json\"\n", - "\n", - "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n", - "VOCODER_CONFIG = \"data/vocoder_config.json\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": 
"CpgmdBVQplbv" - }, - "outputs": [], - "source": [ - "# load configs\n", - "TTS_CONFIG = load_config(TTS_CONFIG)\n", - "VOCODER_CONFIG = load_config(VOCODER_CONFIG)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 471 - }, - "colab_type": "code", - "id": "zmrQxiozIUVE", - "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49", - "tags": [] - }, - "outputs": [], - "source": [ - "# load the audio processor\n", - "TTS_CONFIG.audio['stats_path'] = 'data/tts_scale_stats.npy'\n", - "ap = AudioProcessor(**TTS_CONFIG.audio) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "colab_type": "code", - "id": "8fLoI4ipqMeS", - "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382", - "tags": [] - }, - "outputs": [], - "source": [ - "# LOAD TTS MODEL\n", - "# multi speaker \n", - "speaker_id = None\n", - "speakers = []\n", - "\n", - "# load the model (chinese_mandarin special characters/punctuations are in the tts_config.json)\n", - "if TTS_CONFIG.get(\"characters\"):\n", - " _characters = TTS_CONFIG[\"characters\"][\"characters\"]\n", - " _phonemes = TTS_CONFIG[\"characters\"][\"phonemes\"]\n", - " _punctuations = TTS_CONFIG[\"characters\"][\"punctuations\"]\n", - " _pad = TTS_CONFIG[\"characters\"][\"pad\"]\n", - " _eos = TTS_CONFIG[\"characters\"][\"eos\"]\n", - " _bos = TTS_CONFIG[\"characters\"][\"bos\"]\n", - " \n", - " symbols, phonemes = make_symbols(_characters, _phonemes, punctuations= _punctuations, pad=_pad, eos=_eos, bos=_bos )\n", - "\n", - "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n", - "\n", - "# load model state\n", - "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n", - "\n", - "# load the model\n", - "model.load_state_dict(cp['model'])\n", - "if use_cuda:\n", - " model.cuda()\n", - "model.eval()\n", - "\n", - "# set model stepsize\n", - "if 'r' in cp:\n", - " model.decoder.set_r(cp['r'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "id": "zKoq0GgzqzhQ", - "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb", - "tags": [] - }, - "outputs": [], - "source": [ - "from TTS.vocoder.utils.generic_utils import setup_generator\n", - "\n", - "# LOAD VOCODER MODEL\n", - "vocoder_model = setup_generator(VOCODER_CONFIG)\n", - "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n", - "vocoder_model.remove_weight_norm()\n", - "vocoder_model.inference_padding = 0\n", - "\n", - "\n", - "VOCODER_CONFIG.audio['stats_path'] = 'data/vocoder_scale_stats.npy'\n", - "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n", - "if use_cuda:\n", - " vocoder_model.cuda()\n", - "vocoder_model.eval()\n", - "print(\"\\nVocoder loaded\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ws_YkPKsLgo-" - }, - "source": [ - "## Run Inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Here some test sentences for you to play with :\n", - "sentence = \"我从来不会说很标准的中文。\"\n", - "sentence = \"我喜欢听人工智能的博客。\"\n", - "sentence = \"我来自一个法国郊区的地方。\"\n", - "sentence = \"不比不知道,一比吓一跳!\"\n", - "sentence = \"台湾是一个真的很好玩的地方!\"\n", - "sentence = 
\"干一行,行一行,行行都行。\"\n", - "sentence = \"我要盖被子,好尴尬!\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# You can also play with the style_wav global style token. However, the lady speaking in the baker dataset\n", - "# has no emotion through all the sentences. It's hard to get some nice GST with this.\n", - "# That's also why adding \"!\" or \"?\" at the end of sentence change nothing. The dataset has no such prosody.\n", - "style_wav = {\"2\": 0.3, \"1\": -0.1}\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 134 - }, - "colab_type": "code", - "id": "FuWxZ9Ey5Puj", - "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91", - "tags": [] - }, - "outputs": [], - "source": [ - "sentence = \"我喜欢听人工智能的博客。\"\n", - "style_wav = {\"2\": 0.2, \"7\": -0.1}\n", - "\n", - "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True, style_wav= style_wav)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb new file mode 100644 index 0000000000..1be93a82d2 --- /dev/null +++ b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example_Synthetizer.ipynb @@ -0,0 +1,606 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6LWsNd3_M3MP" + }, + "source": [ + "# Mozilla TTS on CPU Real-Time Speech Synthesis " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "FAqrSIWgLyP0" + }, + "source": [ + "We use Tacotron2 and MultiBand-Melgan models and Baker dataset (chinese mandarin).\n", + "\n", + "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 126K steps (3 days) with a single GPU.\n", + "\n", + "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", + "\n", + "Note that both model performances can be improved with more training." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Ku-dA4DKoeXk" + }, + "source": [ + "### Download Models" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Zlgi8fPdpRF0" + }, + "source": [ + "### Define TTS function" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ZksegYQepkFg" + }, + "source": [ + "### Load Models" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "oVa0kOamprgj" + }, + "outputs": [], + "source": [ + "import os\n", + "import torch\n", + "import IPython\n", + "\n", + "from TTS.utils.synthesizer import Synthesizer\n", + "from TTS.utils.manage import ModelManager\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "EY-sHVO8IFSH" + }, + "outputs": [], + "source": [ + "# runtime settings\n", + "use_cuda = False" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# tts and vocoder name\n", + "TTS_NAME = \"tts_models/zh-CN/baker/tacotron2-DDC-GST\"\n", + "VOCODER_NAME = \"vocoder_models/en/ljspeech/multiband-melgan\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "manager = ModelManager(\"../TTS/.models.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > tts_models/zh-CN/baker/tacotron2-DDC-GST is already downloaded.\n", + " > vocoder_models/en/ljspeech/multiband-melgan is already downloaded.\n" + ] + } + ], + "source": [ + "tts_checkpoint_file, tts_config_file, tts_json_dict = manager.download_model(TTS_NAME)\n", + "vocoder_checkpoint_file, vocoder_config_file, vocoder_json_dict = manager.download_model(VOCODER_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Using model: tacotron2\n", + " > Generator Model: multiband_melgan_generator\n" + ] + } + ], + "source": [ + "synthesizer = Synthesizer(tts_checkpoint_file, tts_config_file, vocoder_checkpoint_file, vocoder_config_file, use_cuda)\n", + "sample_rate = synthesizer.tts_config.audio[\"sample_rate\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Ws_YkPKsLgo-" + }, + "source": [ + "## Run Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Here some test sentences for you to play with :\n", + "sentences= [\"我从来不会说很标准的中文。\",\n", + "\"我喜欢听人工智能的博客。\",\n", + "\"我来自一个法国郊区的地方。\",\n", + "\"不比不知道,一比吓一跳!\",\n", + "\"台湾是一个真的很好玩的地方!\",\n", + "\"干一行,行一行,行行都行。\",\n", + "\"我要盖被子,好尴尬!\",]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['我从来不会说很标准的中文。']\n", + " > Processing time: 1.6665124893188477\n", + " > Real-time factor: 0.5583910829911347\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['我喜欢听人工智能的博客。']\n", + " > Processing time: 
1.4052538871765137\n", + " > Real-time factor: 0.5193391025114328\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['我来自一个法国郊区的地方。']\n", + " > Processing time: 1.605910062789917\n", + " > Real-time factor: 0.5785999490934259\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['不比不知道,一比吓一跳!']\n", + " > Processing time: 1.9105627536773682\n", + " > Real-time factor: 0.6607262973429417\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['台湾是一个真的很好玩的地方!']\n", + " > Processing time: 1.3081049919128418\n", + " > Real-time factor: 0.4218891158389621\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['干一行,行一行,行行都行。']\n", + " > Processing time: 2.0958540439605713\n", + " > Real-time factor: 0.6709288860239634\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['我要盖被子,好尴尬!']\n", + " > Processing time: 1.5188167095184326\n", + " > Real-time factor: 0.6257456734843319\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for sentence in sentences:\n", + " wav = synthesizer.tts(sentence)\n", + " IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['我喜欢听人工智能的博客。']\n", + " > Processing time: 2.114016056060791\n", + " > Real-time factor: 0.643271887228699\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# you can also play with Global Style Token (GST) by feeding a \n", + "# ... 
"style_wav parameter to the tts method\n", + "\n", + "style_wav = {\"2\": 0.2}\n", + "\n", + "wav = synthesizer.tts(sentences[1], style_wav=style_wav)\n", + "IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['我喜欢听人工智能的博客。']\n", + " > Processing time: 1.5687272548675537\n", + " > Real-time factor: 0.6401842606201799\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['我喜欢听人工智能的博客。']\n", + " > Processing time: 2.070594072341919\n", + " > Real-time factor: 0.8067677285683367\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['我喜欢听人工智能的博客。']\n", + " > Processing time: 1.3769311904907227\n", + " > Real-time factor: 0.5088718951180015\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['我喜欢听人工智能的博客。']\n", + " > Processing time: 2.024374485015869\n", + " > Real-time factor: 0.6782983435843654\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Text splitted to sentences.\n", + "['我喜欢听人工智能的博客。']\n", + " > Processing time: 2.4434399604797363\n", + " > Real-time factor: 0.7435119663360867\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# On this model specifically, we can observe that GST token \"2\" controls speech speed.\n", + "# Listen to these five samples: speech gets slower and slower as the value increases.\n", + "for value in [-0.2, -0.1, 0, 0.1, 0.2]:\n", + " style_wav = {\"2\": value}\n", + " wav = synthesizer.tts(sentences[1], style_wav=style_wav)\n", + " IPython.display.display(IPython.display.Audio(wav, rate=sample_rate)) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}
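For readers who want the same pipeline outside the notebook, below is a minimal standalone sketch built only from the ModelManager/Synthesizer calls that appear in the added notebook. The "../TTS/.models.json" path mirrors the notebook; the output filename and the scipy-based save step are illustrative assumptions, not part of the notebook.

# Standalone sketch of the new notebook's workflow (CPU inference).
import numpy as np
from scipy.io import wavfile

from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

TTS_NAME = "tts_models/zh-CN/baker/tacotron2-DDC-GST"
VOCODER_NAME = "vocoder_models/en/ljspeech/multiband-melgan"
use_cuda = False

# Resolve and download the released checkpoints (cached after the first run).
manager = ModelManager("../TTS/.models.json")
tts_checkpoint, tts_config, _ = manager.download_model(TTS_NAME)
vocoder_checkpoint, vocoder_config, _ = manager.download_model(VOCODER_NAME)

# Build the synthesizer exactly as the notebook does.
synthesizer = Synthesizer(tts_checkpoint, tts_config, vocoder_checkpoint, vocoder_config, use_cuda)
sample_rate = synthesizer.tts_config.audio["sample_rate"]

# GST token "2" acts on speech speed for this model; 0.2 slows it down slightly.
wav = synthesizer.tts("我喜欢听人工智能的博客。", style_wav={"2": 0.2})

# Save the float waveform to disk (illustrative; the notebook plays it inline instead).
wavfile.write("baker_gst_demo.wav", sample_rate, np.asarray(wav, dtype=np.float32))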