diff --git a/.gitignore b/.gitignore index 9401d2ebb..944558088 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,15 @@ *.bcf *.toc *.wav -*.sh +datasets/* encoder/saved_models/* synthesizer/saved_models/* vocoder/saved_models/* +*.bak +*.gz +LibriSpeech/* +*.txt +*.TXT +*.flac +*.mp3 +*.zip diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..58738f6f2 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "terminal.integrated.shell.windows": "C:\\Windows\\System32\\cmd.exe", + "terminal.integrated.shellArgs.windows": [ + "/k", + "%userprofile%/miniconda3/Scripts/activate base" + ] +} \ No newline at end of file diff --git a/README.md b/README.md index 21bee69d4..6074d5f23 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,9 @@ # Real-Time Voice Cloning -This repository is an implementation of [Transfer Learning from Speaker Verification to -Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. Feel free to check [my thesis](https://matheo.uliege.be/handle/2268.2/6801) if you're curious or if you're looking for info I haven't documented yet (don't hesitate to make an issue for that too). Mostly I would recommend giving a quick look to the figures beyond the introduction. +This repository is an implementation of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. Feel free to check [my thesis](https://matheo.uliege.be/handle/2268.2/6801) if you're curious, or if you're looking for info I haven't documented yet. Mostly I would recommend giving a quick look to the figures beyond the introduction. -SV2TTS is a three-stage deep learning framework that allows to create a numerical representation of a voice from a few seconds of audio, and to use it to condition a text-to-speech model trained to generalize to new voices. - -**Video demonstration** (click the picture): +SV2TTS is a three-stage deep learning framework that allows the creation of a numerical representation of a voice from a few seconds of audio, then use that data to condition a text-to-speech model trained to generate new voices. +**Video demonstration** (click the play button): [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA) @@ -18,47 +16,48 @@ SV2TTS is a three-stage deep learning framework that allows to create a numerica |[1712.05884](https://arxiv.org/pdf/1712.05884.pdf) | Tacotron 2 (synthesizer) | Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions | [Rayhane-mamah/Tacotron-2](https://github.com/Rayhane-mamah/Tacotron-2) |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo | -## News -**13/11/19**: I'm sorry that I can't maintain this repo as much as I wish I could. I'm working full time on improving voice cloning techniques and I don't have the time to share my improvements here. Plus this repo relies on a lot of old tensorflow code and it's hard to work with. If you're a researcher, then this repo might be of use to you. **If you just want to clone your voice**, do check our demo on [Resemble.AI](https://www.resemble.ai/) - it will give much better results than this repo and will not require a complex setup. 
-**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder. You can use your trained encoder models from this repo with it. +## Get Started +### Requirements +Please use setup.sh (Linux) or setup.bat (Windows) to install the dependencies and requirements. Currently only Python 3.7.x is supported. -**06/07/19:** Need to run within a docker container on a remote server? See [here](https://sean.lane.sh/posts/2019/07/Running-the-Real-Time-Voice-Cloning-project-in-Docker/). +* Windows Install Requirements + * During Python installation, make sure Python is added to PATH. + * During conda installation, make sure you install it 'just for me'. + * During MS Build Tools installation, you only need the C++ package, which requires around 4.7GB. After installing Build Tools, restart the computer to complete the install, then rerun setup.bat to finish the setup process. -**25/06/19:** Experimental support for low-memory GPUs (~2gb) added for the synthesizer. Pass `--low_mem` to `demo_cli.py` or `demo_toolbox.py` to enable it. It adds a big overhead, so it's not recommended if you have enough VRAM. +#### Install Manually: +You will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1) installed first, then run `pip install -r requirements.txt` to install the necessary packages. +### After-install Steps +Next you will need the [pretrained models](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models) if you don't plan to train your own. +These models were trained on a CUDA device, so they'll produce finicky results on a CPU; new CPU models will need to be trained first. (As of 5/1/20) +Download the models and uncompress them in this root folder. If done correctly, you should end up with `/encoder/saved_models`, `/synthesizer/saved_models`, and `/vocoder/saved_models`. -## Quick start -### Requirements -You will need the following whether you plan to use the toolbox only or to retrain the models. +### Test installation +When you believe you have all the necessary pieces, test the program by running `python demo_cli.py`. +If all tests pass, you're good to go. To run on the CPU, pass the option `--cpu`. -**Python 3.7**. Python 3.6 might work too, but I wouldn't go lower because I make extensive use of pathlib. +### Generate Audio from a Dataset +There are a few preconfigured options for datasets. One in particular, [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz), is set up to work with `demo_toolbox.py`. When you download this dataset you can extract it anywhere, but creating a folder named `datasets` in this directory is recommended (all scripts use this directory by default). -Run `pip install -r requirements.txt` to install the necessary packages. Additionally you will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1). +To run the toolbox, use `python demo_toolbox.py` if you followed the recommendation for the datasets directory location. Otherwise, pass the full path to the dataset with the option `-d`. -A GPU is mandatory, but you don't necessarily need a high tier GPU if you only want to use the toolbox. +To set the speaker, you'll need an input audio file. Use Browse in the toolbox to select a personal audio file, or use Record to capture your own voice. 
-### Pretrained models -Download the latest [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). +The toolbox supports other datasets, including [dev-train](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). -### Preliminary -Before you download any dataset, you can begin by testing your configuration with: +If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590). -`python demo_cli.py` +## Contributions & Issues -If all tests pass, you're good to go. -### Datasets -For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `/LibriSpeech/train-clean-100` where `` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox. -### Toolbox -You can then try the toolbox: +## Original Author CorentinJ News +**13/11/19**: I'm sorry that I can't maintain this repo as much as I wish I could. I'm working full time as of June 2019 on improving voice cloning techniques and I don't have the time to share my improvements here. Plus this repo relies on a lot of old tensorflow code and it's hard to work with. If you're a researcher, then this repo might be of use to you. **If you just want to clone your voice**, do check our demo on [Resemble.AI](https://www.resemble.ai/) - it will give much better results than this repo and will not require a complex setup. -`python demo_toolbox.py -d ` -or -`python demo_toolbox.py` +**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder. You can use your trained encoder models from this repo with it. -depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590). +**06/07/19:** Need to run within a docker container on a remote server? See [here](https://sean.lane.sh/posts/2019/07/Running-the-Real-Time-Voice-Cloning-project-in-Docker/). -## Contributions & Issues -I'm working full-time as of June 2019. I don't have time to maintain this repo nor reply to issues. Sorry. +**25/06/19:** Experimental support for low-memory GPUs (~2gb) added for the synthesizer. Pass `--low_mem` to `demo_cli.py` or `demo_toolbox.py` to enable it. It adds a big overhead, so it's not recommended if you have enough VRAM. 
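For reference, the three stages that `demo_cli.py` walks through (encoder → synthesizer → vocoder) can also be driven directly from Python. The snippet below is only a minimal sketch: it assumes the pretrained models have been unpacked into the default `saved_models` folders named above (adjust the checkpoint paths if yours differ), and `reference.wav` stands in for whatever audio file you want to clone.

```python
from pathlib import Path

import numpy as np
import soundfile as sf

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

# Load the three pretrained models (paths assume the layout produced by the
# pretrained archive; change them if your checkpoints live elsewhere).
encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))

# 1. Encoder: turn a few seconds of reference audio into a speaker embedding.
preprocessed_wav = encoder.preprocess_wav(Path("reference.wav"))  # hypothetical input file
embed = encoder.embed_utterance(preprocessed_wav)

# 2. Synthesizer: condition Tacotron on the embedding to predict a mel spectrogram.
specs = synthesizer.synthesize_spectrograms(["Hello, this is a cloned voice."], [embed])

# 3. Vocoder: invert the mel spectrogram into a waveform and save it.
generated_wav = vocoder.infer_waveform(specs[0])
sf.write("demo_output_00.wav", generated_wav.astype(np.float32), synthesizer.sample_rate)
```

On a CPU-only machine the same steps apply, but expect them to be slow and, as noted in the README above, the pretrained checkpoints may behave unpredictably there.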
\ No newline at end of file diff --git a/demo_cli.py b/demo_cli.py index 57bb001c0..0bc9c0e26 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -5,6 +5,7 @@ from vocoder import inference as vocoder from pathlib import Path import numpy as np +import soundfile as sf import librosa import argparse import torch @@ -30,6 +31,8 @@ "overhead but allows to save some GPU memory for lower-end GPUs.") parser.add_argument("--no_sound", action="store_true", help=\ "If True, audio won't be played.") + parser.add_argument( + '--cpu', help='Use CPU.', action='store_true') args = parser.parse_args() print_args(args, parser) if not args.no_sound: @@ -38,22 +41,25 @@ ## Print some environment information (for debugging purposes) print("Running a test of your configuration...\n") - if not torch.cuda.is_available(): - print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready " + if args.cpu: + print("Using CPU for inference.") + elif torch.cuda.is_available(): + device_id = torch.cuda.current_device() + gpu_properties = torch.cuda.get_device_properties(device_id) + print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with " + "%.1fGb total memory.\n" % + (torch.cuda.device_count(), + device_id, + gpu_properties.name, + gpu_properties.major, + gpu_properties.minor, + gpu_properties.total_memory / 1e9)) + else: + print("Your PyTorch installation is not configured. If you have a GPU ready " "for deep learning, ensure that the drivers are properly installed, and that your " - "CUDA version matches your PyTorch installation. CPU-only inference is currently " - "not supported.", file=sys.stderr) + "CUDA version matches your PyTorch installation.", file=sys.stderr) + print("\nIf you're trying to use a cpu, please use the option --cpu.", file=sys.stderr) quit(-1) - device_id = torch.cuda.current_device() - gpu_properties = torch.cuda.get_device_properties(device_id) - print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with " - "%.1fGb total memory.\n" % - (torch.cuda.device_count(), - device_id, - gpu_properties.name, - gpu_properties.major, - gpu_properties.minor, - gpu_properties.total_memory / 1e9)) ## Load the models one by one. 
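The `--cpu` flag added above only selects the code path; for inference to actually run without CUDA, the models have to be loaded onto the CPU. Below is a small illustrative sketch of how a device string can be chosen and handed to the encoder, using the `load_model(weights_fpath, device)` signature that this patch extends in `encoder/inference.py`; the argparse wiring and the checkpoint path are assumptions for the example, not code taken verbatim from the repo.

```python
import argparse
from pathlib import Path

import torch

from encoder import inference as encoder

parser = argparse.ArgumentParser()
parser.add_argument("--cpu", action="store_true", help="Use CPU.")
args = parser.parse_args()

# Fall back to the CPU either on request or when no CUDA device is present.
device = "cpu" if args.cpu or not torch.cuda.is_available() else "cuda"

# encoder.load_model accepts a device string; with this patch the checkpoint is
# mapped onto that device via torch.load(weights_fpath, _device).
encoder.load_model(Path("encoder/saved_models/pretrained.pt"), device)
```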
@@ -116,10 +122,10 @@ num_generated = 0 while True: try: - # Get the reference audio filepath + # Get the reference audio filepath message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \ "wav, m4a, flac, ...):\n" - in_fpath = Path(input(message).replace("\"", "").replace("\'", "")) + in_fpath = Path(input(message).replace("\"", '').replace("\'", '')) ## Computing the embedding @@ -172,15 +178,13 @@ sd.play(generated_wav, synthesizer.sample_rate) # Save it on the disk - fpath = "demo_output_%02d.wav" % num_generated + filename = "demo_output_%02d.wav" % num_generated print(generated_wav.dtype) - librosa.output.write_wav(fpath, generated_wav.astype(np.float32), - synthesizer.sample_rate) + sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate) num_generated += 1 - print("\nSaved output as %s\n\n" % fpath) + print("\nSaved output as %s\n\n" % filename) except Exception as e: print("Caught exception: %s" % repr(e)) print("Restarting\n") - \ No newline at end of file diff --git a/demo_toolbox.py b/demo_toolbox.py index 485c1366d..8ec9772d4 100644 --- a/demo_toolbox.py +++ b/demo_toolbox.py @@ -10,12 +10,11 @@ formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - parser.add_argument("-d", "--datasets_root", type=Path, help= \ + parser.add_argument("-d", "--datasets_root", type=Path, default="./datasets/", help= \ "Path to the directory containing your datasets. See toolbox/__init__.py for a list of " "supported datasets. You can add your own data by created a directory named UserAudio " "in your datasets root. Supported formats are mp3, flac, wav and m4a. Each speaker should " - "be inside a directory, e.g. /UserAudio/speaker_01/audio_01.wav.", - default=None) + "be inside a directory, e.g. /UserAudio/speaker_01/audio_01.wav.") parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models", help="Directory containing saved encoder models") parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models", @@ -30,4 +29,3 @@ # Launch the toolbox print_args(args, parser) Toolbox(**vars(args)) - \ No newline at end of file diff --git a/encoder/data_objects/speaker_verification_dataset.py b/encoder/data_objects/speaker_verification_dataset.py index 77a6e05ea..c1ddd501c 100644 --- a/encoder/data_objects/speaker_verification_dataset.py +++ b/encoder/data_objects/speaker_verification_dataset.py @@ -53,4 +53,3 @@ def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler= def collate(self, speakers): return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) - \ No newline at end of file diff --git a/encoder/inference.py b/encoder/inference.py index 2447832ff..d769dd172 100644 --- a/encoder/inference.py +++ b/encoder/inference.py @@ -30,7 +30,7 @@ def load_model(weights_fpath: Path, device=None): elif isinstance(device, str): _device = torch.device(device) _model = SpeakerEncoder(_device, torch.device("cpu")) - checkpoint = torch.load(weights_fpath) + checkpoint = torch.load(weights_fpath, _device) _model.load_state_dict(checkpoint["model_state"]) _model.eval() print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) diff --git a/encoder/train.py b/encoder/train.py index 071af1b9c..616178b0d 100644 --- a/encoder/train.py +++ b/encoder/train.py @@ -7,11 +7,12 @@ import torch def sync(device: torch.device): - # FIXME - return # For correct profiling (cuda operations are async) if device.type == "cuda": 
torch.cuda.synchronize(device) + + def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, @@ -30,7 +31,7 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, # hyperparameters) faster on the CPU. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # FIXME: currently, the gradient is None if loss_device is cuda - loss_device = torch.device("cpu") + loss_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Create the model and the optimizer model = SpeakerEncoder(device, loss_device) @@ -122,4 +123,3 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, }, backup_fpath) profiler.tick("Extras (visualizations, saving)") - \ No newline at end of file diff --git a/encoder/visualizations.py b/encoder/visualizations.py index 980c74f95..266fa3695 100644 --- a/encoder/visualizations.py +++ b/encoder/visualizations.py @@ -175,4 +175,3 @@ def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, def save(self): if not self.disabled: self.vis.save([self.env_name]) - \ No newline at end of file diff --git a/encoder_preprocess.py b/encoder_preprocess.py index f69f3200a..9fe0a6655 100644 --- a/encoder_preprocess.py +++ b/encoder_preprocess.py @@ -24,12 +24,12 @@ class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptio " -dev", formatter_class=MyFormatter ) - parser.add_argument("datasets_root", type=Path, help=\ + parser.add_argument('-d', "--datasets_root", type=Path, default='./datasets/', help=\ "Path to the directory containing your LibriSpeech/TTS and VoxCeleb datasets.") parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\ "Path to the output directory that will contain the mel spectrograms. If left out, " "defaults to /SV2TTS/encoder/") - parser.add_argument("-d", "--datasets", type=str, + parser.add_argument("-dt", "--datasets_type", type=str, default="librispeech_other,voxceleb1,voxceleb2", help=\ "Comma-separated list of the name of the datasets you want to preprocess. Only the train " "set of these datasets will be used. Possible names: librispeech_other, voxceleb1, " diff --git a/encoder_train.py b/encoder_train.py index b8740a894..b03de9649 100644 --- a/encoder_train.py +++ b/encoder_train.py @@ -14,7 +14,7 @@ "Name for this model instance. If a model state from the same run ID was previously " "saved, the training will restart from there. Pass -f to overwrite saved states and " "restart from scratch.") - parser.add_argument("clean_data_root", type=Path, help= \ + parser.add_argument("-d", "--clean_data_root", type=Path, default='./datasets/SV2TTS/encoder/', help= \ "Path to the output directory of encoder_preprocess.py. 
If you left the default " "output directory when preprocessing, it should be /SV2TTS/encoder/.") parser.add_argument("-m", "--models_dir", type=Path, default="encoder/saved_models/", help=\ @@ -44,4 +44,3 @@ # Run the training print_args(args, parser) train(**vars(args)) - \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4b54673fa..eb52196a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,34 @@ -tensorflow-gpu>=1.10.0,<=1.14.0 -umap-learn -visdom -webrtcvad -librosa>=0.5.1 -matplotlib>=2.0.2 +# python3.7.x (6,7) confirmed +# each portion of tensorflow is neeed +# core package is for RNN, cpu and gpu are for specific system speed-ups +tensorflow==1.15 +tensorflow-cpu==1.15 +tensorflow-gpu==1.15 + +# dependancies +unidecode +inflect numpy>=1.14.0 -scipy>=1.0.0 -tqdm +matplotlib>=2.0.2 +librosa>=0.5.1 +PySoundFile +multiprocess +webrtcvad sounddevice -Unidecode -inflect PyQt5 -multiprocess -numba +umap-learn +visdom + +## AMD CPU support in tensorflow 2.0 +#### win #### +# keras +# plaidml-keras plaidbench +#### linux #### +# tensorflow-rocm +# rocm-dkms + +## tested demo_cli.py and demo_toolbox.py +## Unused requirements +#scipy>=1.0.0 +#tqdm +#numba==0.48.0 diff --git a/setup.bat b/setup.bat new file mode 100644 index 000000000..248c3322e --- /dev/null +++ b/setup.bat @@ -0,0 +1,27 @@ +if not exist %userprofile%/Downloads/python-3.7.7-amd64.exe ( + curl https://www.python.org/ftp/python/3.7.7/python-3.7.7-amd64.exe -o %userprofile%/Downloads/python-3.7.7-amd64.exe + %userprofile%/Downloads/python-3.7.7-amd64.exe +) + +if not exist %userprofile%/Downloads/Miniconda3-latest-Windows-x86_64.exe ( + curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -o %userprofile%/Downloads/Miniconda3-latest-Windows-x86_64.exe + %userprofile%/Downloads/Miniconda3-latest-Windows-x86_64.exe +) + +if not exist %userprofile%/Downloads/vs_BuildTools.exe ( + curl https://download.visualstudio.microsoft.com/download/pr/5e397ebe-38b2-4e18-a187-ac313d07332a/00945fbb0a29f63183b70370043e249218249f83dbc82cd3b46c5646503f9e27/vs_BuildTools.exe -o %userprofile%/Downloads/vs_BuildTools.exe + %userprofile%/Downloads/vs_BuildTools.exe +) + +if not exist vocoder/saved_models ( + python -m pip install gdown + gdown https://drive.google.com/uc?id=1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc + python -c "import zipfile; zipfile.ZipFile('pretrained.zip').extractall()" + del pretrained.zip +) + +start cmd /k "%userprofile%/miniconda3/Scripts/activate base & conda install -y pytorch & exit" +cd /D "%~dp0" +pip install -r requirements.txt + +:: plaidml-setup \ No newline at end of file diff --git a/setup.sh b/setup.sh new file mode 100644 index 000000000..b54f4a614 --- /dev/null +++ b/setup.sh @@ -0,0 +1,43 @@ +conda_installed=$(conda list | grep 'conda: command not found') +if [ '$conda_installed' != '' ]; then + wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + chmod +x Miniconda3-latest-Linux-x86_64.sh + ./Miniconda3-latest-Linux-x86_64.sh + rm Miniconda3-latest-Linux-x86_64.sh +fi + +if [ ! 
-d "vocoder/saved_models" ]; then + python -m pip install gdown + gdown https://drive.google.com/uc?id=1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc + python -c "import zipfile; zipfile.ZipFile('pretrained.zip').extractall()" +fi + +conda install pytorch +sudo apt -y install libportaudio2 gcc libsndfile1 +python3.7 -m pip install -r requirements.txt + +## Future AMD setup (needs tensorflow api v2) +amd='FALSE' +if [ $amd == 'TRUE' ]; then + sudo apt update + sudo apt -y dist-upgrade + sudo apt install libnuma-dev + sudo reboot + + wget -q -O - https://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | sudo apt-key add - + echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list + sudo apt update + sudo apt install rocm-dkms + sudo usermod -a -G video $LOGNAME + echo 'ADD_EXTRA_GROUPS=1' | sudo tee -a /etc/adduser.conf + echo 'EXTRA_GROUPS=video' | sudo tee -a /etc/adduser.conf + sudo reboot +fi + +echo "Finished installation" + +## Possible fix for "This application failed to start because no Qt platform plugin could be initialized. Reinstalling the application may fix this problem." +# sudo apt-get install libxkbcommon-x11-dev + +## Possible fix for webrtcvad failure +#sudo apt install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip \ No newline at end of file diff --git a/synthesizer/feeder.py b/synthesizer/feeder.py index 6fc1b2022..b1acb3d54 100644 --- a/synthesizer/feeder.py +++ b/synthesizer/feeder.py @@ -70,22 +70,22 @@ def __init__(self, coordinator, metadata_filename, hparams): # Create placeholders for inputs and targets. Don"t specify batch size because we want # to be able to feed different batch sizes at eval time. self._placeholders = [ - tf.placeholder(tf.int32, shape=(None, None), name="inputs"), - tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"), - tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), + tf.compat.v1.placeholder(tf.int32, shape=(None, None), name="inputs"), + tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="input_lengths"), + tf.compat.v1.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name="mel_targets"), - tf.placeholder(tf.float32, shape=(None, None), name="token_targets"), - tf.placeholder(tf.int32, shape=(None, ), name="targets_lengths"), - tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), + tf.compat.v1.placeholder(tf.float32, shape=(None, None), name="token_targets"), + tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="targets_lengths"), + tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos"), # SV2TTS - tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size), + tf.compat.v1.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size), name="speaker_embeddings") ] # Create queue for buffering data - queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, + queue = tf.queue.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.int32, tf.int32, tf.float32], name="input_queue") self._enqueue_op = queue.enqueue(self._placeholders) self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \ @@ -100,7 +100,7 @@ def __init__(self, coordinator, metadata_filename, hparams): self.speaker_embeddings.set_shape(self._placeholders[6].shape) # Create eval queue for buffering eval data - eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, + eval_queue = tf.queue.FIFOQueue(1, 
[tf.int32, tf.int32, tf.float32, tf.float32, tf.int32, tf.int32, tf.float32], name="eval_queue") self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \ diff --git a/synthesizer/inference.py b/synthesizer/inference.py index 99fb77810..86bd2fb84 100644 --- a/synthesizer/inference.py +++ b/synthesizer/inference.py @@ -54,7 +54,7 @@ def load(self): """ if self._low_mem: raise Exception("Cannot load the synthesizer permanently in low mem mode") - tf.reset_default_graph() + tf.compat.v1.reset_default_graph() self._model = Tacotron2(self.checkpoint_fpath, hparams) def synthesize_spectrograms(self, texts: List[str], @@ -88,7 +88,7 @@ def synthesize_spectrograms(self, texts: List[str], @staticmethod def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts): # Load the model and forward the inputs - tf.reset_default_graph() + tf.compat.v1.reset_default_graph() model = Tacotron2(checkpoint_fpath, hparams) specs, alignments = model.my_synthesize(embeddings, texts) @@ -134,4 +134,3 @@ def griffin_lim(mel): with the same parameters present in hparams.py. """ return audio.inv_mel_spectrogram(mel, hparams) - \ No newline at end of file diff --git a/synthesizer/models/attention.py b/synthesizer/models/attention.py index 58892ad74..1f40d4563 100644 --- a/synthesizer/models/attention.py +++ b/synthesizer/models/attention.py @@ -60,10 +60,10 @@ def _location_sensitive_score(W_query, W_fil, W_keys): dtype = W_query.dtype num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] - v_a = tf.get_variable( + v_a = tf.compat.v1.get_variable( "attention_variable_projection", shape=[num_units], dtype=dtype, initializer=tf.contrib.layers.xavier_initializer()) - b_a = tf.get_variable( + b_a = tf.compat.v1.get_variable( "attention_bias", shape=[num_units], dtype=dtype, initializer=tf.zeros_initializer()) @@ -155,10 +155,10 @@ def __init__(self, probability_fn=normalization_function, name=name) - self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, + self.location_convolution = tf.compat.v1.layers.Conv1D(filters=hparams.attention_filters, kernel_size=hparams.attention_kernel, padding="same", use_bias=True, bias_initializer=tf.zeros_initializer(), name="location_features_convolution") - self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, + self.location_layer = tf.compat.v1.layers.Dense(units=num_units, use_bias=False, dtype=tf.float32, name="location_features_layer") self._cumulate = cumulate_weights diff --git a/synthesizer/models/helpers.py b/synthesizer/models/helpers.py index eec069940..4e58ccd00 100644 --- a/synthesizer/models/helpers.py +++ b/synthesizer/models/helpers.py @@ -119,7 +119,7 @@ def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, n #Pick previous outputs randomly with respect to teacher forcing ratio next_inputs = tf.cond( - tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), + tf.less(tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), lambda: self._targets[:, time, :], #Teacher-forcing: return true frame lambda: outputs[:,-self._output_dim:]) diff --git a/synthesizer/models/modules.py b/synthesizer/models/modules.py index 769657220..f9fe7ebe9 100644 --- a/synthesizer/models/modules.py +++ b/synthesizer/models/modules.py @@ -1,4 +1,5 @@ import tensorflow as tf +import torch class HighwayNet: @@ -6,12 +7,12 @@ def __init__(self, units, name=None): self.units = units self.scope = 
"HighwayNet" if name is None else name - self.H_layer = tf.layers.Dense(units=self.units, activation=tf.nn.relu, name="H") - self.T_layer = tf.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T", + self.H_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.relu, name="H") + self.T_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T", bias_initializer=tf.constant_initializer(-1.)) def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): H = self.H_layer(inputs) T = self.T_layer(inputs) return H * T + inputs * (1. - T) @@ -38,8 +39,8 @@ def __init__(self, K, conv_channels, pool_size, projections, projection_kernel_s self._bw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_backward_RNN".format(self.scope)) def __call__(self, inputs, input_lengths): - with tf.variable_scope(self.scope): - with tf.variable_scope("conv_bank"): + with tf.compat.v1.variable_scope(self.scope): + with tf.compat.v1.variable_scope("conv_bank"): # Convolution bank: concatenate on the last axis to stack channels from all # convolutions # The convolution bank uses multiple different kernel sizes to have many insights @@ -71,7 +72,7 @@ def __call__(self, inputs, input_lengths): # Additional projection in case of dimension mismatch (for HighwayNet "residual" # connection) if highway_input.shape[2] != self.highway_units: - highway_input = tf.layers.dense(highway_input, self.highway_units) + highway_input = tf.compat.v1.layers.Dense(highway_input, self.highway_units) # 4-layer HighwayNet for highwaynet in self.highwaynet_layers: @@ -88,7 +89,7 @@ def __call__(self, inputs, input_lengths): return tf.concat(outputs, axis=2) # Concat forward and backward outputs -class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell): +class ZoneoutLSTMCell(tf.compat.v1.nn.rnn_cell.RNNCell): """Wrapper for tf LSTM to create Zoneout LSTM Cell inspired by: @@ -108,8 +109,11 @@ def __init__(self, num_units, is_training, zoneout_factor_cell=0., zoneout_facto if zm < 0. or zs > 1.: raise ValueError("One/both provided Zoneout factors are not in [0, 1]") - - self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name) + + if torch.cuda.is_available(): + self._cell = tf.contrib.cudnn_rnn.CudnnLSTM(num_units, name=name) + else: + self._cell = tf.contrib.rnn.LSTMBlockCell(num_units, name=name) self._zoneout_cell = zoneout_factor_cell self._zoneout_outputs = zoneout_factor_output self.is_training = is_training @@ -144,16 +148,13 @@ def __call__(self, inputs, state, scope=None): if self.is_training: # nn.dropout takes keep_prob (probability to keep activations) not drop_prob ( # probability to mask activations)! 
- c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c, - (1 - self._zoneout_cell)) + prev_c - h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h, - (1 - self._zoneout_outputs)) + prev_h - + c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c, (1 - self._zoneout_cell)) + prev_c + h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h, (1 - self._zoneout_outputs)) + prev_h else: c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h - new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c, + new_state = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c, h]) return output, new_state @@ -175,7 +176,7 @@ def __init__(self, is_training, hparams, activation=tf.nn.relu, scope=None): """ super(EncoderConvolutions, self).__init__() self.is_training = is_training - + self.kernel_size = hparams.enc_conv_kernel_size self.channels = hparams.enc_conv_channels self.activation = activation @@ -184,7 +185,7 @@ def __init__(self, is_training, hparams, activation=tf.nn.relu, scope=None): self.enc_conv_num_layers = hparams.enc_conv_num_layers def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): x = inputs for i in range(self.enc_conv_num_layers): x = conv1d(x, self.kernel_size, self.channels, self.activation, @@ -226,8 +227,8 @@ def __init__(self, is_training, size=256, zoneout=0.1, scope=None): name="encoder_bw_LSTM") def __call__(self, inputs, input_lengths): - with tf.variable_scope(self.scope): - outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn( + with tf.compat.v1.variable_scope(self.scope): + outputs, (fw_state, bw_state) = tf.compat.v1.nn.bidirectional_dynamic_rnn( self._fw_cell, self._bw_cell, inputs, @@ -239,7 +240,8 @@ def __call__(self, inputs, input_lengths): class Prenet: - """Two fully connected layers used as an information bottleneck for the attention. + """ + Two fully connected layers used as an information bottleneck for the attention. """ def __init__(self, is_training, layers_sizes=[256, 256], drop_rate=0.5, activation=tf.nn.relu, @@ -263,13 +265,13 @@ def __init__(self, is_training, layers_sizes=[256, 256], drop_rate=0.5, activati def __call__(self, inputs): x = inputs - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): for i, size in enumerate(self.layers_sizes): - dense = tf.layers.dense(x, units=size, activation=self.activation, + dense = tf.compat.v1.layers.dense(x, units=size, activation=self.activation, name="dense_{}".format(i + 1)) # The paper discussed introducing diversity in generation at inference time # by using a dropout of 0.5 only in prenet layers (in both training and inference). 
- x = tf.layers.dropout(dense, rate=self.drop_rate, training=True, + x = tf.compat.v1.layers.dropout(dense, rate=self.drop_rate, training=True, name="dropout_{}".format(i + 1) + self.scope) return x @@ -302,10 +304,10 @@ def __init__(self, is_training, layers=2, size=1024, zoneout=0.1, scope=None): name="decoder_LSTM_{}".format(i + 1)) for i in range(layers)] - self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True) + self._cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(self.rnn_layers, state_is_tuple=True) def __call__(self, inputs, states): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): return self._cell(inputs, states) @@ -327,14 +329,14 @@ def __init__(self, shape=80, activation=None, scope=None): self.activation = activation self.scope = "Linear_projection" if scope is None else scope - self.dense = tf.layers.Dense(units=shape, activation=activation, + self.dense = tf.compat.v1.layers.Dense(units=shape, activation=activation, name="projection_{}".format(self.scope)) def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): # If activation==None, this returns a simple Linear projection # else the projection will be passed through an activation function - # output = tf.layers.dense(inputs, units=self.shape, activation=self.activation, + # output = tf.compat.v1.layers.Dense(inputs, units=self.shape, activation=self.activation, # name="projection_{}".format(self.scope)) output = self.dense(inputs) @@ -362,7 +364,7 @@ def __init__(self, is_training, shape=1, activation=tf.nn.sigmoid, scope=None): self.scope = "stop_token_projection" if scope is None else scope def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): output = tf.layers.dense(inputs, units=self.shape, activation=None, name="projection_{}".format(self.scope)) @@ -399,7 +401,7 @@ def __init__(self, is_training, hparams, activation=tf.nn.tanh, scope=None): self.drop_rate = hparams.tacotron_dropout_rate def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): x = inputs for i in range(self.postnet_num_layers - 1): x = conv1d(x, self.kernel_size, self.channels, self.activation, @@ -412,16 +414,16 @@ def __call__(self, inputs): def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, scope): - with tf.variable_scope(scope): - conv1d_output = tf.layers.conv1d( + with tf.compat.v1.variable_scope(scope): + conv1d_output = tf.compat.v1.layers.conv1d( inputs, filters=channels, kernel_size=kernel_size, activation=None, padding="same") - batched = tf.layers.batch_normalization(conv1d_output, training=is_training) + batched = tf.compat.v1.layers.batch_normalization(conv1d_output, training=is_training) activated = activation(batched) - return tf.layers.dropout(activated, rate=drop_rate, training=is_training, + return tf.compat.v1.layers.dropout(activated, rate=drop_rate, training=is_training, name="dropout_{}".format(scope)) diff --git a/synthesizer/models/tacotron.py b/synthesizer/models/tacotron.py index 9c4de4df0..4b952c100 100644 --- a/synthesizer/models/tacotron.py +++ b/synthesizer/models/tacotron.py @@ -24,11 +24,11 @@ def split_func(x, split_pos): class Tacotron(): """Tacotron-2 Feature prediction Model. 
""" - + def __init__(self, hparams): self._hparams = hparams - - def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, + + def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False, split_infos=None): """ @@ -45,55 +45,55 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, entries in the mel spectrogram. Only needed for training. """ if mel_targets is None and stop_token_targets is not None: - raise ValueError("no multi targets were provided but token_targets were given") + raise ValueError( + "no multi targets were provided but token_targets were given") if mel_targets is not None and stop_token_targets is None and not gta: - raise ValueError("Mel targets are provided without corresponding token_targets") + raise ValueError( + "Mel targets are provided without corresponding token_targets") if not gta and self._hparams.predict_linear == True and linear_targets is None and \ - is_training: + is_training: raise ValueError( "Model is set to use post processing to predict linear spectrograms in training " - "but no linear targets given!") + "but no linear targets given!") if gta and linear_targets is not None: - raise ValueError("Linear spectrogram prediction is not supported in GTA mode!") + raise ValueError( + "Linear spectrogram prediction is not supported in GTA mode!") if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError( "Model set to mask paddings but no targets lengths provided for the mask!") if is_training and is_evaluating: raise RuntimeError( "Model can not be in training and evaluation modes at the same time!") - - split_device = "/cpu:0" if self._hparams.tacotron_num_gpus > 1 or \ - self._hparams.split_on_cpu else "/gpu:{}".format( - self._hparams.tacotron_gpu_start_idx) + + split_device = "/cpu:0" if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else "/gpu:{}".format(self._hparams.tacotron_gpu_start_idx) with tf.device(split_device): hp = self._hparams lout_int = [tf.int32] * hp.tacotron_num_gpus lout_float = [tf.float32] * hp.tacotron_num_gpus - + tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) tower_targets_lengths = \ tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if \ - targets_lengths is not None else targets_lengths - + targets_lengths is not None else targets_lengths + ### SV2TTS ### - tower_embed_targets = tf.split(embed_targets, num_or_size_splits=hp.tacotron_num_gpus, axis=0) - + ############## - - p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) - p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]], - lout_float) if mel_targets is not None else mel_targets - p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]], - lout_float) if stop_token_targets is not None else \ - stop_token_targets - + p_inputs = tf.numpy_function( + split_func, [inputs, split_infos[:, 0]], lout_int) + p_mel_targets = tf.numpy_function(split_func, [mel_targets, split_infos[:, 1]], + lout_float) if mel_targets is not None else mel_targets + p_stop_token_targets = tf.numpy_function(split_func, [stop_token_targets, split_infos[:, 2]], + lout_float) if stop_token_targets is not None else \ + stop_token_targets + tower_inputs = [] tower_mel_targets = [] tower_stop_token_targets = [] - + 
batch_size = tf.shape(inputs)[0] mel_channels = hp.num_mels for i in range(hp.tacotron_num_gpus): @@ -104,69 +104,74 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, if p_stop_token_targets is not None: tower_stop_token_targets.append( tf.reshape(p_stop_token_targets[i], [batch_size, -1])) - + self.tower_decoder_output = [] self.tower_alignments = [] self.tower_stop_token_prediction = [] self.tower_mel_outputs = [] - + tower_embedded_inputs = [] tower_enc_conv_output_shape = [] tower_encoder_cond_outputs = [] tower_residual = [] tower_projected_residual = [] - + # 1. Declare GPU Devices gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)] for i in range(hp.tacotron_num_gpus): - with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", - worker_device=gpus[i])): - with tf.variable_scope("inference") as scope: - assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled") + with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device=split_device, + worker_device=gpus[i])): + with tf.compat.v1.variable_scope("inference") as scope: + assert hp.tacotron_teacher_forcing_mode in ( + "constant", "scheduled") if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training: assert global_step is not None - - # GTA is only used for predicting mels to train Wavenet vocoder, so we ommit + + # GTA is only used for predicting mels to train Wavenet vocoder, so we ommit # post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta - + # Embeddings ==> [batch_size, sequence_length, embedding_dim] - self.embedding_table = tf.get_variable( + self.embedding_table = tf.compat.v1.get_variable( "inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32) - embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i]) - + embedded_inputs = tf.nn.embedding_lookup( + self.embedding_table, tower_inputs[i]) + # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( - EncoderConvolutions(is_training, hparams=hp, scope="encoder_convolutions"), + EncoderConvolutions( + is_training, hparams=hp, scope="encoder_convolutions"), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope="encoder_LSTM")) - - encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i]) - + + encoder_outputs = encoder_cell( + embedded_inputs, tower_input_lengths[i]) + # For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape - - + ### SV2TT2 ### - + # Append the speaker embedding to the encoder output at each timestep - tileable_shape = [-1, 1, self._hparams.speaker_embedding_size] - tileable_embed_targets = tf.reshape(tower_embed_targets[i], tileable_shape) - tiled_embed_targets = tf.tile(tileable_embed_targets, - [1, tf.shape(encoder_outputs)[1], 1]) - encoder_cond_outputs = tf.concat((encoder_outputs, tiled_embed_targets), 2) - + tileable_shape = [-1, 1, + self._hparams.speaker_embedding_size] + tileable_embed_targets = tf.reshape( + tower_embed_targets[i], tileable_shape) + tiled_embed_targets = tf.tile(tileable_embed_targets, + [1, tf.shape(encoder_outputs)[1], 1]) + encoder_cond_outputs = tf.concat( + (encoder_outputs, tiled_embed_targets), 2) + ############## - - + # Decoder Parts # Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, 
scope="decoder_prenet") # Attention Mechanism attention_mechanism = LocationSensitiveAttention(hp.attention_dim, - encoder_cond_outputs, + encoder_cond_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape( @@ -186,7 +191,7 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, stop_projection = StopProjection(is_training or is_evaluating, shape=hp .outputs_per_step, scope="stop_token_projection") - + # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell( prenet, @@ -194,86 +199,93 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, decoder_lstm, frame_projection, stop_projection) - + # Define the helper for our decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) - + # initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) - + # Only use max iterations at synthesis time - max_iters = hp.max_iters if not (is_training or is_evaluating) else None - + max_iters = hp.max_iters if not ( + is_training or is_evaluating) else None + # Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( - CustomDecoder(decoder_cell, self.helper, decoder_init_state), + CustomDecoder(decoder_cell, self.helper, + decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) - - # Reshape outputs to be one output per entry + + # Reshape outputs to be one output per entry # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] - decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) - stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) - + decoder_output = tf.reshape( + frames_prediction, [batch_size, -1, hp.num_mels]) + stop_token_prediction = tf.reshape( + stop_token_prediction, [batch_size, -1]) + # Postnet - postnet = Postnet(is_training, hparams=hp, scope="postnet_convolutions") - - # Compute residual using post-net ==> [batch_size, decoder_steps * r, + postnet = Postnet(is_training, hparams=hp, + scope="postnet_convolutions") + + # Compute residual using post-net ==> [batch_size, decoder_steps * r, # postnet_channels] residual = postnet(decoder_output) - - # Project residual to same dimension as mel spectrogram + + # Project residual to same dimension as mel spectrogram # ==> [batch_size, decoder_steps * r, num_mels] - residual_projection = FrameProjection(hp.num_mels, scope="postnet_projection") + residual_projection = FrameProjection( + hp.num_mels, scope="postnet_projection") projected_residual = residual_projection(residual) - + # Compute the mel spectrogram mel_outputs = decoder_output + projected_residual - + if post_condition: - # Add post-processing CBHG. This does a great job at extracting features - # from mels before projection to Linear specs. + # Add post-processing CBHG. This does a great job at extracting features + # from mels before projection to Linear specs. 
post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, name="CBHG_postnet") - + # [batch_size, decoder_steps(mel_frames), cbhg_channels] post_outputs = post_cbhg(mel_outputs, None) - + # Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection(hp.num_freq, scope="cbhg_linear_specs_projection") - + # [batch_size, decoder_steps(linear_frames), num_freq] linear_outputs = linear_specs_projection(post_outputs) - + # Grab alignments from the final decoder state alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) - + self.tower_decoder_output.append(decoder_output) self.tower_alignments.append(alignments) - self.tower_stop_token_prediction.append(stop_token_prediction) + self.tower_stop_token_prediction.append( + stop_token_prediction) self.tower_mel_outputs.append(mel_outputs) tower_embedded_inputs.append(embedded_inputs) tower_enc_conv_output_shape.append(enc_conv_output_shape) tower_encoder_cond_outputs.append(encoder_cond_outputs) tower_residual.append(residual) tower_projected_residual.append(projected_residual) - + if post_condition: self.tower_linear_outputs.append(linear_outputs) log("initialisation done {}".format(gpus[i])) - + if is_training: self.ratio = self.helper._ratio self.tower_inputs = tower_inputs @@ -282,44 +294,53 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, # self.tower_linear_targets = tower_linear_targets self.tower_targets_lengths = tower_targets_lengths self.tower_stop_token_targets = tower_stop_token_targets - - self.all_vars = tf.trainable_variables() - + + self.all_vars = tf.compat.v1.trainable_variables() + log("Initialized Tacotron model. Dimensions (? 
= dynamic shape): ") log(" Train mode: {}".format(is_training)) log(" Eval mode: {}".format(is_evaluating)) log(" GTA mode: {}".format(gta)) - log(" Synthesis mode: {}".format(not (is_training or is_evaluating))) + log(" Synthesis mode: {}".format( + not (is_training or is_evaluating))) log(" Input: {}".format(inputs.shape)) for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx): log(" device: {}".format(i)) - log(" embedding: {}".format(tower_embedded_inputs[i].shape)) - log(" enc conv out: {}".format(tower_enc_conv_output_shape[i])) - log(" encoder out (cond): {}".format(tower_encoder_cond_outputs[i].shape)) - log(" decoder out: {}".format(self.tower_decoder_output[i].shape)) - log(" residual out: {}".format(tower_residual[i].shape)) - log(" projected residual out: {}".format(tower_projected_residual[i].shape)) - log(" mel out: {}".format(self.tower_mel_outputs[i].shape)) + log(" embedding: {}".format( + tower_embedded_inputs[i].shape)) + log(" enc conv out: {}".format( + tower_enc_conv_output_shape[i])) + log(" encoder out (cond): {}".format( + tower_encoder_cond_outputs[i].shape)) + log(" decoder out: {}".format( + self.tower_decoder_output[i].shape)) + log(" residual out: {}".format( + tower_residual[i].shape)) + log(" projected residual out: {}".format( + tower_projected_residual[i].shape)) + log(" mel out: {}".format( + self.tower_mel_outputs[i].shape)) if post_condition: - log(" linear out: {}".format(self.tower_linear_outputs[i].shape)) - log(" out: {}".format(self.tower_stop_token_prediction[i].shape)) - + log(" linear out: {}".format( + self.tower_linear_outputs[i].shape)) + log(" out: {}".format( + self.tower_stop_token_prediction[i].shape)) + # 1_000_000 is causing syntax problems for some people?! Python please :) log(" Tacotron Parameters {:.3f} Million.".format( np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000)) - - + def add_loss(self): """Adds loss to the model. Sets "loss" field. initialize must have been called.""" hp = self._hparams - + self.tower_before_loss = [] self.tower_after_loss = [] self.tower_stop_token_loss = [] self.tower_regularization_loss = [] self.tower_linear_loss = [] self.tower_loss = [] - + total_before_loss = 0 total_after_loss = 0 total_stop_token_loss = 0 @@ -329,11 +350,11 @@ def add_loss(self): gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)] - + for i in range(hp.tacotron_num_gpus): - with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", - worker_device=gpus[i])): - with tf.variable_scope("loss") as scope: + with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", + worker_device=gpus[i])): + with tf.compat.v1.variable_scope("loss") as scope: if hp.mask_decoder: # Compute loss of predictions before postnet before = MaskedMSE(self.tower_mel_targets[i], self.tower_decoder_output[i], @@ -356,24 +377,25 @@ def add_loss(self): linear_loss = 0. 
else: # Compute loss of predictions before postnet - before = tf.losses.mean_squared_error(self.tower_mel_targets[i], + before = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i], self.tower_decoder_output[i]) # Compute loss after postnet - after = tf.losses.mean_squared_error(self.tower_mel_targets[i], + after = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i], self.tower_mel_outputs[i]) # Compute loss (for learning dynamic generation stop) stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( labels=self.tower_stop_token_targets[i], logits=self.tower_stop_token_prediction[i])) - + # SV2TTS extra L1 loss - l1 = tf.abs(self.tower_mel_targets[i] - self.tower_decoder_output[i]) + l1 = tf.abs( + self.tower_mel_targets[i] - self.tower_decoder_output[i]) linear_loss = tf.reduce_mean(l1) # if hp.predict_linear: # # Compute linear loss # # From https://github.com/keithito/tacotron/blob/tacotron2-work-in - # # -progress/models/tacotron.py + # # -progress/models/tacotron.py # # Prioritize loss for frequencies under 2000 Hz. # l1 = tf.abs(self.tower_linear_targets[i] - self.tower_linear_outputs[i]) # n_priority_freq = int(2000 / (hp.sample_rate * 0.5) * hp.num_freq) @@ -381,34 +403,34 @@ def add_loss(self): # l1[:, :, 0:n_priority_freq]) # else: # linear_loss = 0. - + # Compute the regularization weight if hp.tacotron_scale_regularization: reg_weight_scaler = 1. / ( - 2 * hp.max_abs_value) if hp.symmetric_mels else 1. / ( + 2 * hp.max_abs_value) if hp.symmetric_mels else 1. / ( hp.max_abs_value) reg_weight = hp.tacotron_reg_weight * reg_weight_scaler else: reg_weight = hp.tacotron_reg_weight - + # Regularize variables # Exclude all types of bias, RNN (Bengio et al. On the difficulty of training recurrent neural networks), embeddings and prediction projection layers. # Note that we consider attention mechanism v_a weights as a prediction projection layer and we don"t regularize it. (This gave better stability) regularization = tf.add_n([tf.nn.l2_loss(v) for v in self.all_vars if not ( - "bias" in v.name or "Bias" in v.name or "_projection" in v.name or "inputs_embedding" in v.name - or "RNN" in v.name or "LSTM" in v.name)]) * reg_weight - + "bias" in v.name or "Bias" in v.name or "_projection" in v.name or "inputs_embedding" in v.name + or "RNN" in v.name or "LSTM" in v.name)]) * reg_weight + # Compute final loss term self.tower_before_loss.append(before) self.tower_after_loss.append(after) self.tower_stop_token_loss.append(stop_token_loss) self.tower_regularization_loss.append(regularization) self.tower_linear_loss.append(linear_loss) - + loss = before + after + stop_token_loss + regularization + linear_loss self.tower_loss.append(loss) - + for i in range(hp.tacotron_num_gpus): total_before_loss += self.tower_before_loss[i] total_after_loss += self.tower_after_loss[i] @@ -416,14 +438,14 @@ def add_loss(self): total_regularization_loss += self.tower_regularization_loss[i] total_linear_loss += self.tower_linear_loss[i] total_loss += self.tower_loss[i] - + self.before_loss = total_before_loss / hp.tacotron_num_gpus self.after_loss = total_after_loss / hp.tacotron_num_gpus self.stop_token_loss = total_stop_token_loss / hp.tacotron_num_gpus self.regularization_loss = total_regularization_loss / hp.tacotron_num_gpus self.linear_loss = total_linear_loss / hp.tacotron_num_gpus self.loss = total_loss / hp.tacotron_num_gpus - + def add_optimizer(self, global_step): """Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. 
Args: @@ -431,36 +453,37 @@ def add_optimizer(self, global_step): """ hp = self._hparams tower_gradients = [] - + # 1. Declare GPU Devices gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)] - + grad_device = "/cpu:0" if hp.tacotron_num_gpus > 1 else gpus[0] - + with tf.device(grad_device): - with tf.variable_scope("optimizer") as scope: + with tf.compat.v1.variable_scope("optimizer") as scope: if hp.tacotron_decay_learning_rate: self.decay_steps = hp.tacotron_decay_steps self.decay_rate = hp.tacotron_decay_rate self.learning_rate = self._learning_rate_decay( hp.tacotron_initial_learning_rate, global_step) else: - self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate) - - optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1, + self.learning_rate = tf.convert_to_tensor( + hp.tacotron_initial_learning_rate) + + optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1, hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon) - + # 2. Compute Gradient for i in range(hp.tacotron_num_gpus): # Device placement - with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", - worker_device=gpus[i])): + with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", + worker_device=gpus[i])): # agg_loss += self.tower_loss[i] - with tf.variable_scope("optimizer") as scope: + with tf.compat.v1.variable_scope("optimizer") as scope: gradients = optimizer.compute_gradients(self.tower_loss[i]) tower_gradients.append(gradients) - + # 3. Average Gradient with tf.device(grad_device): avg_grads = [] @@ -475,47 +498,48 @@ def add_optimizer(self, global_step): # Average over the "tower" dimension. grad = tf.concat(axis=0, values=grads) grad = tf.reduce_mean(grad, 0) - + v = grad_and_vars[0][1] avg_grads.append(grad) vars.append(v) - + self.gradients = avg_grads # Just for causion # https://github.com/Rayhane-mamah/Tacotron-2/issues/11 if hp.tacotron_clip_gradients: - clipped_gradients, _ = tf.clip_by_global_norm(avg_grads, 1.) # __mark 0.5 refer + clipped_gradients, _ = tf.clip_by_global_norm( + avg_grads, 1.) # __mark 0.5 refer else: clipped_gradients = avg_grads - + # Add dependency on UPDATE_OPS; otherwise batchnorm won"t work correctly. 
See: # https://github.com/tensorflow/tensorflow/issues/1122 - with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): + with tf.control_dependencies(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)): self.optimize = optimizer.apply_gradients(zip(clipped_gradients, vars), global_step=global_step) - + def _learning_rate_decay(self, init_lr, global_step): ################################################################# # Narrow Exponential Decay: - + # Phase 1: lr = 1e-3 # We only start learning rate decay after 50k steps - + # Phase 2: lr in ]1e-5, 1e-3[ # decay reach minimal value at step 310k - + # Phase 3: lr = 1e-5 # clip by minimal learning rate value (step > 310k) ################################################################# hp = self._hparams - + # Compute natural exponential decay - lr = tf.train.exponential_decay(init_lr, + lr = tf.compat.v1.train.exponential_decay(init_lr, global_step - hp.tacotron_start_decay, # lr = 1e-3 at step 50k self.decay_steps, self.decay_rate, # lr = 1e-5 around step 310k name="lr_exponential_decay") - + # clip learning rate by max and min values (initial and final values) return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr) diff --git a/synthesizer/preprocess.py b/synthesizer/preprocess.py index b2894aa71..0bfc3d6b9 100644 --- a/synthesizer/preprocess.py +++ b/synthesizer/preprocess.py @@ -8,14 +8,16 @@ from tqdm import tqdm import numpy as np import librosa +import os def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams): # Gather the input directories + if(str(datasets_root)[0] != '/' or str(datasets_root)[1] != ':'): + datasets_root = Path(os.getcwd() + '/' + str(datasets_root)) dataset_root = datasets_root.joinpath("LibriSpeech") - input_dirs = [dataset_root.joinpath("train-clean-100"), - dataset_root.joinpath("train-clean-360")] + input_dirs = [dataset_root.joinpath("train-clean-100"), dataset_root.joinpath("train-clean-360")] print("\n ".join(map(str, ["Using data from:"] + input_dirs))) assert all(input_dir.exists() for input_dir in input_dirs) @@ -82,7 +84,7 @@ def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams) def split_on_silences(wav_fpath, words, end_times, hparams): # Load the audio waveform - wav, _ = librosa.load(wav_fpath, hparams.sample_rate) + wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate) if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max @@ -222,4 +224,3 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) job = Pool(n_processes).imap(func, fpaths) list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) - \ No newline at end of file diff --git a/synthesizer/tacotron2.py b/synthesizer/tacotron2.py index 4a5b199ce..e4c68505f 100644 --- a/synthesizer/tacotron2.py +++ b/synthesizer/tacotron2.py @@ -12,13 +12,13 @@ class Tacotron2: def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"): log("Constructing model: %s" % model_name) #Force the batch size to be known in order to use attention masking in batch synthesis - inputs = tf.placeholder(tf.int32, (None, None), name="inputs") - input_lengths = tf.placeholder(tf.int32, (None,), name="input_lengths") - speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size), + inputs = tf.compat.v1.placeholder(tf.int32, (None, None), name="inputs") + input_lengths = 
tf.compat.v1.placeholder(tf.int32, (None,), name="input_lengths") + speaker_embeddings = tf.compat.v1.placeholder(tf.float32, (None, hparams.speaker_embedding_size), name="speaker_embeddings") - targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets") - split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos") - with tf.variable_scope("Tacotron_model") as scope: + targets = tf.compat.v1.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets") + split_infos = tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos") + with tf.compat.v1.variable_scope("Tacotron_model") as scope: self.model = create_model(model_name, hparams) if gta: self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta, @@ -52,14 +52,14 @@ def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"): log("Loading checkpoint: %s" % checkpoint_path) #Memory allocation on the GPUs as needed - config = tf.ConfigProto() + config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True - self.session = tf.Session(config=config) - self.session.run(tf.global_variables_initializer()) + self.session = tf.compat.v1.Session(config=config) + self.session.run(tf.compat.v1.global_variables_initializer()) - saver = tf.train.Saver() + saver = tf.compat.v1.train.Saver() saver.restore(self.session, checkpoint_path) def my_synthesize(self, speaker_embeds, texts): diff --git a/synthesizer/train.py b/synthesizer/train.py index 4fe6bbda3..16d99f4a4 100644 --- a/synthesizer/train.py +++ b/synthesizer/train.py @@ -20,61 +20,71 @@ def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoi # Create tensorboard projector config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig() config.model_checkpoint_path = checkpoint_path - + for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta): # Initialize config embedding = config.embeddings.add() # Specifiy the embedding variable and the metadata embedding.tensor_name = embedding_name embedding.metadata_path = path_to_meta - + # Project the embeddings to space dimensions for visualization - tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config) + tf.contrib.tensorboard.plugins.projector.visualize_embeddings( + summary_writer, config) def add_train_stats(model, hparams): - with tf.variable_scope("stats") as scope: + with tf.compat.v1.variable_scope("stats") as scope: for i in range(hparams.tacotron_num_gpus): - tf.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i]) - tf.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i]) - tf.summary.scalar("before_loss", model.before_loss) - tf.summary.scalar("after_loss", model.after_loss) - + tf.compat.v1.summary.histogram( + "mel_outputs %d" % i, model.tower_mel_outputs[i]) + tf.compat.v1.summary.histogram( + "mel_targets %d" % i, model.tower_mel_targets[i]) + tf.compat.v1.summary.scalar("before_loss", model.before_loss) + tf.compat.v1.summary.scalar("after_loss", model.after_loss) + if hparams.predict_linear: - tf.summary.scalar("linear_loss", model.linear_loss) + tf.compat.v1.summary.scalar("linear_loss", model.linear_loss) for i in range(hparams.tacotron_num_gpus): - tf.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i]) - tf.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i]) - - 
tf.summary.scalar("regularization_loss", model.regularization_loss) - tf.summary.scalar("stop_token_loss", model.stop_token_loss) - tf.summary.scalar("loss", model.loss) - tf.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed + tf.compat.v1.summary.histogram( + "mel_outputs %d" % i, model.tower_linear_outputs[i]) + tf.compat.v1.summary.histogram( + "mel_targets %d" % i, model.tower_linear_targets[i]) + + tf.compat.v1.summary.scalar( + "regularization_loss", model.regularization_loss) + tf.compat.v1.summary.scalar("stop_token_loss", model.stop_token_loss) + tf.compat.v1.summary.scalar("loss", model.loss) + # Control learning rate decay speed + tf.compat.v1.summary.scalar("learning_rate", model.learning_rate) if hparams.tacotron_teacher_forcing_mode == "scheduled": - tf.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing + # Control teacher forcing + tf.compat.v1.summary.scalar("teacher_forcing_ratio", model.ratio) # ratio decay when mode = "scheduled" - gradient_norms = [tf.norm(grad) for grad in model.gradients] - tf.summary.histogram("gradient_norm", gradient_norms) - tf.summary.scalar("max_gradient_norm", tf.reduce_max(gradient_norms)) # visualize + gradient_norms = [tf.norm(tensor=grad) for grad in model.gradients] + tf.compat.v1.summary.histogram("gradient_norm", gradient_norms) + tf.compat.v1.summary.scalar("max_gradient_norm", tf.reduce_max( + input_tensor=gradient_norms)) # visualize # gradients (in case of explosion) - return tf.summary.merge_all() + return tf.compat.v1.summary.merge_all() def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, loss): values = [ - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss", - simple_value=before_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss", - simple_value=after_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss", - simple_value=stop_token_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss), + tf.compat.v1.Summary.Value( + tag="Tacotron_eval_model/eval_stats/eval_before_loss", simple_value=before_loss), + tf.compat.v1.Summary.Value( + tag="Tacotron_eval_model/eval_stats/eval_after_loss", simple_value=after_loss), + tf.compat.v1.Summary.Value( + tag="Tacotron_eval_model/eval_stats/stop_token_loss", simple_value=stop_token_loss), + tf.compat.v1.Summary.Value( + tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss), ] if linear_loss is not None: - values.append(tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss", - simple_value=linear_loss)) - test_summary = tf.Summary(value=values) + values.append(tf.compat.v1.Summary.Value( + tag="Tacotron_eval_model/eval_stats/eval_linear_loss", simple_value=linear_loss)) + test_summary = tf.compat.v1.Summary(value=values) summary_writer.add_summary(test_summary, step) @@ -83,9 +93,9 @@ def time_string(): def model_train_mode(args, feeder, hparams, global_step): - with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope: + with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope: model = create_model("Tacotron", hparams) - model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings, + model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings, feeder.mel_targets, feeder.token_targets, targets_lengths=feeder.targets_lengths, global_step=global_step, is_training=True, 
split_infos=feeder.split_infos) @@ -96,11 +106,11 @@ def model_train_mode(args, feeder, hparams, global_step): def model_test_mode(args, feeder, hparams, global_step): - with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope: + with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope: model = create_model("Tacotron", hparams) - model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, + model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_speaker_embeddings, feeder.eval_mel_targets, - feeder.eval_token_targets, targets_lengths=feeder.eval_targets_lengths, + feeder.eval_token_targets, targets_lengths=feeder.eval_targets_lengths, global_step=global_step, is_training=False, is_evaluating=True, split_infos=feeder.eval_split_infos) model.add_loss() @@ -126,28 +136,28 @@ def train(log_dir, args, hparams): os.makedirs(eval_wav_dir, exist_ok=True) os.makedirs(tensorboard_dir, exist_ok=True) os.makedirs(meta_folder, exist_ok=True) - + checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt") metadat_fpath = os.path.join(args.synthesizer_root, "train.txt") - + log("Checkpoint path: {}".format(checkpoint_fpath)) log("Loading training data from: {}".format(metadat_fpath)) log("Using model: Tacotron") log(hparams_debug_string()) - + # Start by setting a seed for repeatability - tf.set_random_seed(hparams.tacotron_random_seed) - + tf.compat.v1.set_random_seed(hparams.tacotron_random_seed) + # Set up data feeder coord = tf.train.Coordinator() - with tf.variable_scope("datafeeder") as scope: + with tf.compat.v1.variable_scope("datafeeder") as scope: feeder = Feeder(coord, metadat_fpath, hparams) - + # Set up model: global_step = tf.Variable(0, name="global_step", trainable=False) model, stats = model_train_mode(args, feeder, hparams, global_step) eval_model = model_test_mode(args, feeder, hparams, global_step) - + # Embeddings metadata char_embedding_meta = os.path.join(meta_folder, "CharacterEmbeddings.tsv") if not os.path.isfile(char_embedding_meta): @@ -155,143 +165,151 @@ def train(log_dir, args, hparams): for symbol in symbols: if symbol == " ": symbol = "\\s" # For visual purposes, swap space with \s - + f.write("{}\n".format(symbol)) - + char_embedding_meta = char_embedding_meta.replace(log_dir, "..") - + # Book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) - saver = tf.train.Saver(max_to_keep=5) - - log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps)) - + saver = tf.compat.v1.train.Saver(max_to_keep=5) + + log("Tacotron training set to a maximum of {} steps".format( + args.tacotron_train_steps)) + # Memory allocation on the GPU as needed - config = tf.ConfigProto() + config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True + #config.gpu_options.per_process_gpu_memory_fraction = 0.4 config.allow_soft_placement = True - + # Train - with tf.Session(config=config) as sess: + with tf.compat.v1.Session(config=config) as sess: try: - summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) - - sess.run(tf.global_variables_initializer()) - + summary_writer = tf.compat.v1.summary.FileWriter( + tensorboard_dir, sess.graph) + sess.run(tf.compat.v1.global_variables_initializer()) + # saved model restoring if args.restore: # Restore saved model if the user requested it, default = True try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) - + if checkpoint_state and checkpoint_state.model_checkpoint_path: log("Loading 
checkpoint {}".format(checkpoint_state.model_checkpoint_path), slack=True) - saver.restore(sess, checkpoint_state.model_checkpoint_path) - + saver.restore( + sess, checkpoint_state.model_checkpoint_path) + else: log("No model to load at {}".format(save_dir), slack=True) - saver.save(sess, checkpoint_fpath, global_step=global_step) - + saver.save(sess, checkpoint_fpath, + global_step=global_step) + except tf.errors.OutOfRangeError as e: log("Cannot restore checkpoint: {}".format(e), slack=True) else: log("Starting new training!", slack=True) saver.save(sess, checkpoint_fpath, global_step=global_step) - + # initializing feeder feeder.start_threads(sess) - + # Training loop while not coord.should_stop() and step < args.tacotron_train_steps: start_time = time.time() - step, loss, opt = sess.run([global_step, model.loss, model.optimize]) + step, loss, opt = sess.run( + [global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]".format( step, time_window.average, loss, loss_window.average) - log(message, end="\r", slack=(step % args.checkpoint_interval == 0)) + log(message, end="\r", slack=(step % + args.checkpoint_interval == 0)) print(message) - + if loss > 100 or np.isnan(loss): - log("Loss exploded to {:.5f} at step {}".format(loss, step)) + log("Loss exploded to {:.5f} at step {}".format( + loss, step)) raise Exception("Loss exploded") - + if step % args.summary_interval == 0: log("\nWriting summary at step {}".format(step)) summary_writer.add_summary(sess.run(stats), step) - + if step % args.eval_interval == 0: # Run eval and save eval stats log("\nRunning evaluation at step {}".format(step)) - + eval_losses = [] before_losses = [] after_losses = [] stop_token_losses = [] linear_losses = [] linear_loss = None - + if hparams.predict_linear: for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, \ - mel_t, t_len, align, lin_p, lin_t = sess.run( - [ - eval_model.tower_loss[0], eval_model.tower_before_loss[0], - eval_model.tower_after_loss[0], - eval_model.tower_stop_token_loss[0], - eval_model.tower_linear_loss[0], - eval_model.tower_mel_outputs[0][0], - eval_model.tower_mel_targets[0][0], - eval_model.tower_targets_lengths[0][0], - eval_model.tower_alignments[0][0], - eval_model.tower_linear_outputs[0][0], - eval_model.tower_linear_targets[0][0], - ]) + mel_t, t_len, align, lin_p, lin_t = sess.run( + [ + eval_model.tower_loss[0], eval_model.tower_before_loss[0], + eval_model.tower_after_loss[0], + eval_model.tower_stop_token_loss[0], + eval_model.tower_linear_loss[0], + eval_model.tower_mel_outputs[0][0], + eval_model.tower_mel_targets[0][0], + eval_model.tower_targets_lengths[0][0], + eval_model.tower_alignments[0][0], + eval_model.tower_linear_outputs[0][0], + eval_model.tower_linear_targets[0][0], + ]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) linear_losses.append(linear_loss) linear_loss = sum(linear_losses) / len(linear_losses) - + wav = audio.inv_linear_spectrogram(lin_p.T, hparams) audio.save_wav(wav, os.path.join(eval_wav_dir, "step-{}-eval-wave-from-linear.wav".format( step)), sr=hparams.sample_rate) - + else: for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len,\ - align = sess.run( - [ - eval_model.tower_loss[0], eval_model.tower_before_loss[0], - 
eval_model.tower_after_loss[0], - eval_model.tower_stop_token_loss[0], - eval_model.tower_mel_outputs[0][0], - eval_model.tower_mel_targets[0][0], - eval_model.tower_targets_lengths[0][0], - eval_model.tower_alignments[0][0] - ]) + align = sess.run( + [ + eval_model.tower_loss[0], eval_model.tower_before_loss[0], + eval_model.tower_after_loss[0], + eval_model.tower_stop_token_loss[0], + eval_model.tower_mel_outputs[0][0], + eval_model.tower_mel_targets[0][0], + eval_model.tower_targets_lengths[0][0], + eval_model.tower_alignments[0][0] + ]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) - + eval_loss = sum(eval_losses) / len(eval_losses) before_loss = sum(before_losses) / len(before_losses) after_loss = sum(after_losses) / len(after_losses) - stop_token_loss = sum(stop_token_losses) / len(stop_token_losses) - + stop_token_loss = sum( + stop_token_losses) / len(stop_token_losses) + log("Saving eval log to {}..".format(eval_dir)) # Save some log to monitor model improvement on same unseen sequence wav = audio.inv_mel_spectrogram(mel_p.T, hparams) audio.save_wav(wav, os.path.join(eval_wav_dir, "step-{}-eval-wave-from-mel.wav".format(step)), sr=hparams.sample_rate) - + plot.plot_alignment(align, os.path.join(eval_plot_dir, "step-{}-eval-align.png".format(step)), title="{}, {}, step={}, loss={:.5f}".format("Tacotron", @@ -301,7 +319,7 @@ def train(log_dir, args, hparams): max_len=t_len // hparams.outputs_per_step) plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, "step-{" - "}-eval-mel-spectrogram.png".format( + "}-eval-mel-spectrogram.png".format( step)), title="{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), @@ -309,7 +327,7 @@ def train(log_dir, args, hparams): eval_loss), target_spectrogram=mel_t, max_len=t_len) - + if hparams.predict_linear: plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir, "step-{}-eval-linear-spectrogram.png".format( @@ -318,17 +336,16 @@ def train(log_dir, args, hparams): "Tacotron", time_string(), step, eval_loss), target_spectrogram=lin_t, max_len=t_len, auto_aspect=True) - + log("Eval loss for global step {}: {:.3f}".format(step, eval_loss)) log("Writing eval summary!") add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss) - - if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \ - step == 300: + + if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps: # Save model and current global step saver.save(sess, checkpoint_fpath, global_step=global_step) - + log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..") input_seq, mel_prediction, alignment, target, target_length = sess.run([ model.tower_inputs[0][0], @@ -337,21 +354,23 @@ def train(log_dir, args, hparams): model.tower_mel_targets[0][0], model.tower_targets_lengths[0][0], ]) - + # save predicted mel spectrogram to disk (debug) mel_filename = "mel-prediction-step-{}.npy".format(step) np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) - + # save griffin lim inverted wav for debug (mel -> wav) wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams) audio.save_wav(wav, - os.path.join(wav_dir, "step-{}-wave-from-mel.wav".format(step)), + os.path.join( + wav_dir, "step-{}-wave-from-mel.wav".format(step)), sr=hparams.sample_rate) - + # save alignment plot to disk (control purposes) plot.plot_alignment(alignment, - os.path.join(plot_dir, 
"step-{}-align.png".format(step)), + os.path.join( + plot_dir, "step-{}-align.png".format(step)), title="{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, loss), @@ -365,23 +384,24 @@ def train(log_dir, args, hparams): step, loss), target_spectrogram=target, max_len=target_length) - log("Input at step {}: {}".format(step, sequence_to_text(input_seq))) - + log("Input at step {}: {}".format( + step, sequence_to_text(input_seq))) + if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1: # Get current checkpoint state checkpoint_state = tf.train.get_checkpoint_state(save_dir) - + # Update Projector log("\nSaving Model Character Embeddings visualization..") add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path) log("Tacotron Character embeddings have been updated on tensorboard!") - + log("Tacotron training complete after {} global steps!".format( args.tacotron_train_steps), slack=True) return save_dir - + except Exception as e: log("Exiting due to exception: {}".format(e), slack=True) traceback.print_exc() diff --git a/synthesizer_preprocess_audio.py b/synthesizer_preprocess_audio.py index a0dc47b4d..2f52cd86d 100644 --- a/synthesizer_preprocess_audio.py +++ b/synthesizer_preprocess_audio.py @@ -12,7 +12,7 @@ "vocoder for training.", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - parser.add_argument("datasets_root", type=Path, help=\ + parser.add_argument('-d', "--datasets_root", type=Path, default='./datasets/', help=\ "Path to the directory containing your LibriSpeech/TTS datasets.") parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\ "Path to the output directory that will contain the mel spectrograms, the audios and the " diff --git a/synthesizer_preprocess_embeds.py b/synthesizer_preprocess_embeds.py index 94f864d5d..afba770d1 100644 --- a/synthesizer_preprocess_embeds.py +++ b/synthesizer_preprocess_embeds.py @@ -9,7 +9,7 @@ description="Creates embeddings for the synthesizer from the LibriSpeech utterances.", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - parser.add_argument("synthesizer_root", type=Path, help=\ + parser.add_argument('-d', "--synthesizer_root", type=Path, default='./datasets/SV2TTS/synthesizer/', help=\ "Path to the synthesizer training data that contains the audios and the train.txt file. " "If you let everything as default, it should be /SV2TTS/synthesizer/.") parser.add_argument("-e", "--encoder_model_fpath", type=Path, diff --git a/synthesizer_train.py b/synthesizer_train.py index 4d46bcb80..43dae3924 100644 --- a/synthesizer_train.py +++ b/synthesizer_train.py @@ -12,44 +12,46 @@ def prepare_run(args): run_name = args.name log_dir = os.path.join(args.models_dir, "logs-{}".format(run_name)) os.makedirs(log_dir, exist_ok=True) - infolog.init(os.path.join(log_dir, "Terminal_train_log"), run_name, args.slack_url) + infolog.init(os.path.join(log_dir, "Terminal_train_log"), + run_name, args.slack_url) return log_dir, modified_hp if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("name", help="Name of the run and of the logging directory.") - parser.add_argument("synthesizer_root", type=str, help=\ - "Path to the synthesizer training data that contains the audios and the train.txt file. 
" - "If you let everything as default, it should be /SV2TTS/synthesizer/.") - parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\ - "Path to the output directory that will contain the saved model weights and the logs.") + parser.add_argument( + "name", help="Name of the run and of the logging directory.") + parser.add_argument('-d', "--synthesizer_root", type=str, default='./datasets/SV2TTS/synthesizer/', + help="Path to the synthesizer training data that contains the audios and the train.txt file. " + "If you let everything as default, it should be /SV2TTS/synthesizer/.") + parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", + help="Path to the output directory that will contain the saved model weights and the logs.") parser.add_argument("--mode", default="synthesis", help="mode for synthesis of tacotron after training") parser.add_argument("--GTA", default="True", - help="Ground truth aligned synthesis, defaults to True, only considered " - "in Tacotron synthesis mode") + help="Ground truth aligned synthesis, defaults to True, only considered in Tacotron synthesis mode") parser.add_argument("--restore", type=bool, default=True, help="Set this to False to do a fresh training") parser.add_argument("--summary_interval", type=int, default=2500, help="Steps between running summary ops") parser.add_argument("--embedding_interval", type=int, default=10000, help="Steps between updating embeddings projection visualization") - parser.add_argument("--checkpoint_interval", type=int, default=2000, # Was 5000 + parser.add_argument("--checkpoint_interval", type=int, default=2000, # Was 5000 help="Steps between writing checkpoints") - parser.add_argument("--eval_interval", type=int, default=100000, # Was 10000 + parser.add_argument("--eval_interval", type=int, default=100000, # Was 10000 help="Steps between eval on test data") - parser.add_argument("--tacotron_train_steps", type=int, default=2000000, # Was 100000 + parser.add_argument("--tacotron_train_steps", type=int, default=2000000, # Was 100000 help="total number of tacotron training steps") - parser.add_argument("--tf_log_level", type=int, default=1, help="Tensorflow C++ log level.") + parser.add_argument("--tf_log_level", type=int, + default=1, help="Tensorflow C++ log level.") parser.add_argument("--slack_url", default=None, help="slack webhook notification destination link") parser.add_argument("--hparams", default="", help="Hyperparameter overrides as a comma-separated list of name=value " - "pairs") + "pairs") args = parser.parse_args() print_args(args, parser) - + log_dir, hparams = prepare_run(args) - + tacotron_train(args, log_dir, hparams) diff --git a/toolbox/__init__.py b/toolbox/__init__.py index 607fa72a0..972f73102 100644 --- a/toolbox/__init__.py +++ b/toolbox/__init__.py @@ -8,6 +8,8 @@ import numpy as np import traceback import sys +import os +import soundfile as sf # Use this directory structure for your datasets, or modify it to fit your needs @@ -102,7 +104,11 @@ def load_from_browser(self, fpath=None): self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name) - name = str(fpath.relative_to(self.datasets_root)) + + if(str(self.datasets_root)[0] == '/' or str(self.datasets_root)[1] == ':'): + name = str(fpath.relative_to(self.datasets_root)) + else: + name = os.getcwd() + '/' + str(fpath) speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name # Select the next utterance @@ -111,14 +117,14 
@@ def load_from_browser(self, fpath=None): elif fpath == "": return else: - name = fpath.name - speaker_name = fpath.parent.name + name = str(fpath).replace('\\', '/') + speaker_name = 'Custom' # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for # playback, so as to have a fair comparison with the generated audio - wav = Synthesizer.load_preprocess_wav(fpath) + wav = Synthesizer.load_preprocess_wav(name) self.ui.log("Loaded %s" % name) - + self.filename = os.path.basename(name) self.add_real_utterance(wav, name, speaker_name) def record(self): @@ -211,6 +217,9 @@ def vocoder_progress(i, seq_len, b_size, gen_rate): wav = wav / np.abs(wav).max() * 0.97 self.ui.play(wav, Synthesizer.sample_rate) + # Save it + sf.write('./Custom_%s.wav' % self.filename, wav, Synthesizer.sample_rate) + # Compute the embedding # TODO: this is problematic with different sampling rates, gotta fix it if not encoder.is_loaded(): diff --git a/toolbox/ui.py b/toolbox/ui.py index c40fca500..9b77b3db4 100644 --- a/toolbox/ui.py +++ b/toolbox/ui.py @@ -188,7 +188,7 @@ def browse_file(self): caption="Select an audio file", filter="Audio Files (*.mp3 *.flac *.wav *.m4a)" ) - return Path(fpath[0]) if fpath[0] != "" else "" + return str(fpath[0]) if fpath[0] != "" else "" @staticmethod def repopulate_box(box, items, random=False): diff --git a/utils/argutils.py b/utils/argutils.py index db4168302..e292769e0 100644 --- a/utils/argutils.py +++ b/utils/argutils.py @@ -37,4 +37,3 @@ def print_args(args: argparse.Namespace, parser=None): param, value = items[i] print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value)) print("") - \ No newline at end of file diff --git a/utils/profiler.py b/utils/profiler.py index 17175b9e1..66ef4033d 100644 --- a/utils/profiler.py +++ b/utils/profiler.py @@ -42,4 +42,3 @@ def summarize(self): print(" %s mean: %4.0fms std: %4.0fms" % (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000)) print("", flush=True) - \ No newline at end of file diff --git a/vocoder/inference.py b/vocoder/inference.py index 19e639c1a..7e546845d 100644 --- a/vocoder/inference.py +++ b/vocoder/inference.py @@ -6,7 +6,7 @@ _model = None # type: WaveRNN def load_model(weights_fpath, verbose=True): - global _model + global _model, _device if verbose: print("Building Wave-RNN") @@ -23,11 +23,17 @@ def load_model(weights_fpath, verbose=True): hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode=hp.voc_mode - ).cuda() + ) + + if torch.cuda.is_available(): + _model = _model.cuda() + _device = torch.device('cuda') + else: + _device = torch.device('cpu') if verbose: print("Loading model weights at %s" % weights_fpath) - checkpoint = torch.load(weights_fpath) + checkpoint = torch.load(weights_fpath, _device) _model.load_state_dict(checkpoint['model_state']) _model.eval() diff --git a/vocoder/models/fatchord_version.py b/vocoder/models/fatchord_version.py index 798842d72..429572bd0 100644 --- a/vocoder/models/fatchord_version.py +++ b/vocoder/models/fatchord_version.py @@ -157,7 +157,10 @@ def generate(self, mels, batched, target, overlap, mu_law, progress_callback=Non rnn2 = self.get_gru_cell(self.rnn2) with torch.no_grad(): - mels = mels.cuda() + if torch.cuda.is_available(): + mels = mels.cuda() + else: + mels = mels.cpu() wave_len = (mels.size(-1) - 1) * self.hop_length mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both') mels, aux = self.upsample(mels.transpose(1, 2)) @@ -168,9 +171,14 @@ def generate(self, mels, batched, target, overlap, mu_law, 
progress_callback=Non b_size, seq_len, _ = mels.size() - h1 = torch.zeros(b_size, self.rnn_dims).cuda() - h2 = torch.zeros(b_size, self.rnn_dims).cuda() - x = torch.zeros(b_size, 1).cuda() + if torch.cuda.is_available(): + h1 = torch.zeros(b_size, self.rnn_dims).cuda() + h2 = torch.zeros(b_size, self.rnn_dims).cuda() + x = torch.zeros(b_size, 1).cuda() + else: + h1 = torch.zeros(b_size, self.rnn_dims).cpu() + h2 = torch.zeros(b_size, self.rnn_dims).cpu() + x = torch.zeros(b_size, 1).cpu() d = self.aux_dims aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)] @@ -260,7 +268,10 @@ def pad_tensor(self, x, pad, side='both'): # i.e., it won't generalise to other shapes/dims b, t, c = x.size() total = t + 2 * pad if side == 'both' else t + pad - padded = torch.zeros(b, total, c).cuda() + if torch.cuda.is_available(): + padded = torch.zeros(b, total, c).cuda() + else: + padded = torch.zeros(b, total, c).cpu() if side == 'before' or side == 'both': padded[:, pad:pad + t, :] = x elif side == 'after': @@ -306,7 +317,10 @@ def fold_with_overlap(self, x, target, overlap): padding = target + 2 * overlap - remaining x = self.pad_tensor(x, padding, side='after') - folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda() + if torch.cuda.is_available(): + folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda() + else: + folded = torch.zeros(num_folds, target + 2 * overlap, features).cpu() # Get the values for the folded tensor for i in range(num_folds): diff --git a/vocoder_preprocess.py b/vocoder_preprocess.py index 415ed37ac..be12990d5 100644 --- a/vocoder_preprocess.py +++ b/vocoder_preprocess.py @@ -38,4 +38,3 @@ class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptio args.out_dir = os.path.join(args.datasets_root, "SV2TTS", "vocoder") run_synthesis(args.in_dir, args.out_dir, args.model_dir, modified_hp) - \ No newline at end of file diff --git a/vocoder_train.py b/vocoder_train.py index d712ffa3e..55717b597 100644 --- a/vocoder_train.py +++ b/vocoder_train.py @@ -53,4 +53,3 @@ # Run the training print_args(args, parser) train(**vars(args)) - \ No newline at end of file
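
A note on the `tf.compat.v1` rewrites in `synthesizer/models/tacotron.py`, `synthesizer/tacotron2.py` and `synthesizer/train.py`: they keep the original graph-mode constructs (placeholders, sessions, savers, summaries, variable scopes) reachable from newer TensorFlow builds. As a hedged reminder that is not part of this diff: under TensorFlow 2.x those calls only run once v2 behaviour is disabled, and the remaining `tf.contrib.tensorboard` projector call in `train.py` has no `compat.v1` equivalent, so it would still need TF 1.x or a separate replacement. A minimal sketch of that assumption:

import tensorflow as tf

# Hedged sketch (not in the diff): graph-mode symbols such as
# tf.compat.v1.placeholder and tf.compat.v1.Session only work on TF 2.x
# after v2 behaviour / eager execution is turned off at import time.
tf.compat.v1.disable_v2_behavior()

with tf.compat.v1.Session() as sess:
    x = tf.compat.v1.placeholder(tf.float32, (None,), name="x")
    y = 2.0 * x
    print(sess.run(y, feed_dict={x: [1.0, 2.0]}))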
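
The path handling added to `synthesizer/preprocess.py` and `toolbox/__init__.py` decides whether `datasets_root` is absolute by inspecting its first two characters (a leading `/` for POSIX, a drive colon at index 1 for Windows). As written, the negated `or` check in `preprocess.py` also matches POSIX absolute paths (their second character is not a colon), so those get the working directory prepended as well. `pathlib` covers both platforms without character tests; a hedged alternative sketch (the helper name is made up and this is not what the diff uses):

from pathlib import Path

def absolute_datasets_root(datasets_root) -> Path:
    # Path.resolve() anchors a relative path at the current working directory
    # and understands both '/data/sets' and 'C:\\data\\sets' style paths,
    # so no per-character test is needed.
    return Path(datasets_root).expanduser().resolve()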
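
The CPU fallback added to `vocoder/inference.py` and `vocoder/models/fatchord_version.py` repeats the same `torch.cuda.is_available()` branch before every tensor allocation; note that `.cpu()` on a freshly created CPU tensor is a no-op, so the `else` branches exist only for symmetry. The same behaviour can be expressed by picking a `torch.device` once and passing it to the constructors. A minimal sketch of that pattern, not taken from the diff (the example sizes are placeholders, not the model's hyperparameters):

import torch

# Choose the device once, mirroring the _device module variable that
# vocoder/inference.py now sets.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

b_size, rnn_dims = 4, 512  # placeholder sizes for illustration

# torch.zeros(..., device=...) replaces the per-call .cuda()/.cpu() branching.
h1 = torch.zeros(b_size, rnn_dims, device=device)
h2 = torch.zeros(b_size, rnn_dims, device=device)
x = torch.zeros(b_size, 1, device=device)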