From 378ab4b624cde524126729dc46c9653d3e337aab Mon Sep 17 00:00:00 2001 From: pusalieth Date: Mon, 27 Apr 2020 22:14:01 -0700 Subject: [PATCH 01/43] udpated webrtcvad to webrtcvad-wheels --- requirements.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4b54673fa..74f1b2dc5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -tensorflow-gpu>=1.10.0,<=1.14.0 +tensorflow-gpu umap-learn visdom -webrtcvad +webrtcvad-wheels librosa>=0.5.1 matplotlib>=2.0.2 numpy>=1.14.0 @@ -13,3 +13,6 @@ inflect PyQt5 multiprocess numba + +## Possible linux fix +#sudo apt install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip \ No newline at end of file From f4182de3db1ea2c12baf6bc849b9bd23425d865b Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 28 Apr 2020 00:07:28 -0700 Subject: [PATCH 02/43] working cpu model - produced weird autio clip thought --- .gitignore | 1 + README.md | 2 +- demo_cli.py | 32 +++++++++++++++++------------- encoder/inference.py | 2 +- requirements.txt | 10 +++++++--- vocoder/inference.py | 12 ++++++++--- vocoder/models/fatchord_version.py | 26 ++++++++++++++++++------ 7 files changed, 57 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 9401d2ebb..c061d1d14 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ encoder/saved_models/* synthesizer/saved_models/* vocoder/saved_models/* +*.bak \ No newline at end of file diff --git a/README.md b/README.md index 21bee69d4..429009cda 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Before you download any dataset, you can begin by testing your configuration wit If all tests pass, you're good to go. ### Datasets -For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `/LibriSpeech/train-clean-100` where `` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox. +For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `/LibriSpeech/train-clean-100` where `` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). If you chose to use your own dataset you'll need your own audio files, or you will have to record it with the toolbox. ### Toolbox You can then try the toolbox: diff --git a/demo_cli.py b/demo_cli.py index 57bb001c0..867098c4c 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -30,6 +30,8 @@ "overhead but allows to save some GPU memory for lower-end GPUs.") parser.add_argument("--no_sound", action="store_true", help=\ "If True, audio won't be played.") + parser.add_argument( + '--cpu', help='Use CPU.', action='store_true') args = parser.parse_args() print_args(args, parser) if not args.no_sound: @@ -38,22 +40,24 @@ ## Print some environment information (for debugging purposes) print("Running a test of your configuration...\n") - if not torch.cuda.is_available(): - print("Your PyTorch installation is not configured to use CUDA. 
If you have a GPU ready " + if args.cpu: + encoder.load_model(args.enc_model_fpath) + elif torch.cuda.is_available(): + device_id = torch.cuda.current_device() + gpu_properties = torch.cuda.get_device_properties(device_id) + print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with " + "%.1fGb total memory.\n" % + (torch.cuda.device_count(), + device_id, + gpu_properties.name, + gpu_properties.major, + gpu_properties.minor, + gpu_properties.total_memory / 1e9)) + else: + print("Your PyTorch installation is not configured. If you have a GPU ready " "for deep learning, ensure that the drivers are properly installed, and that your " - "CUDA version matches your PyTorch installation. CPU-only inference is currently " - "not supported.", file=sys.stderr) + "CUDA version matches your PyTorch installation.", file=sys.stderr) quit(-1) - device_id = torch.cuda.current_device() - gpu_properties = torch.cuda.get_device_properties(device_id) - print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with " - "%.1fGb total memory.\n" % - (torch.cuda.device_count(), - device_id, - gpu_properties.name, - gpu_properties.major, - gpu_properties.minor, - gpu_properties.total_memory / 1e9)) ## Load the models one by one. diff --git a/encoder/inference.py b/encoder/inference.py index 2447832ff..d769dd172 100644 --- a/encoder/inference.py +++ b/encoder/inference.py @@ -30,7 +30,7 @@ def load_model(weights_fpath: Path, device=None): elif isinstance(device, str): _device = torch.device(device) _model = SpeakerEncoder(_device, torch.device("cpu")) - checkpoint = torch.load(weights_fpath) + checkpoint = torch.load(weights_fpath, _device) _model.load_state_dict(checkpoint["model_state"]) _model.eval() print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) diff --git a/requirements.txt b/requirements.txt index 74f1b2dc5..373d5496b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -tensorflow-gpu +# python3.7 or less +tensorflow==1.15 umap-learn visdom webrtcvad-wheels @@ -13,6 +14,9 @@ inflect PyQt5 multiprocess numba +unidecode -## Possible linux fix -#sudo apt install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip \ No newline at end of file +## Possible linux fix for webrtcvad failure +#sudo apt install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip +## Possible linux fix for 'OSError: PortAudio library not found' +# sudo apt-get install libportaudio2 \ No newline at end of file diff --git a/vocoder/inference.py b/vocoder/inference.py index 19e639c1a..7e546845d 100644 --- a/vocoder/inference.py +++ b/vocoder/inference.py @@ -6,7 +6,7 @@ _model = None # type: WaveRNN def load_model(weights_fpath, verbose=True): - global _model + global _model, _device if verbose: print("Building Wave-RNN") @@ -23,11 +23,17 @@ def load_model(weights_fpath, verbose=True): hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode=hp.voc_mode - ).cuda() + ) + + if torch.cuda.is_available(): + _model = _model.cuda() + _device = torch.device('cuda') + else: + _device = torch.device('cpu') if verbose: print("Loading model weights at %s" % weights_fpath) - checkpoint = torch.load(weights_fpath) + checkpoint = torch.load(weights_fpath, _device) _model.load_state_dict(checkpoint['model_state']) _model.eval() diff --git a/vocoder/models/fatchord_version.py b/vocoder/models/fatchord_version.py index 798842d72..429572bd0 100644 
--- a/vocoder/models/fatchord_version.py +++ b/vocoder/models/fatchord_version.py @@ -157,7 +157,10 @@ def generate(self, mels, batched, target, overlap, mu_law, progress_callback=Non rnn2 = self.get_gru_cell(self.rnn2) with torch.no_grad(): - mels = mels.cuda() + if torch.cuda.is_available(): + mels = mels.cuda() + else: + mels = mels.cpu() wave_len = (mels.size(-1) - 1) * self.hop_length mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both') mels, aux = self.upsample(mels.transpose(1, 2)) @@ -168,9 +171,14 @@ def generate(self, mels, batched, target, overlap, mu_law, progress_callback=Non b_size, seq_len, _ = mels.size() - h1 = torch.zeros(b_size, self.rnn_dims).cuda() - h2 = torch.zeros(b_size, self.rnn_dims).cuda() - x = torch.zeros(b_size, 1).cuda() + if torch.cuda.is_available(): + h1 = torch.zeros(b_size, self.rnn_dims).cuda() + h2 = torch.zeros(b_size, self.rnn_dims).cuda() + x = torch.zeros(b_size, 1).cuda() + else: + h1 = torch.zeros(b_size, self.rnn_dims).cpu() + h2 = torch.zeros(b_size, self.rnn_dims).cpu() + x = torch.zeros(b_size, 1).cpu() d = self.aux_dims aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)] @@ -260,7 +268,10 @@ def pad_tensor(self, x, pad, side='both'): # i.e., it won't generalise to other shapes/dims b, t, c = x.size() total = t + 2 * pad if side == 'both' else t + pad - padded = torch.zeros(b, total, c).cuda() + if torch.cuda.is_available(): + padded = torch.zeros(b, total, c).cuda() + else: + padded = torch.zeros(b, total, c).cpu() if side == 'before' or side == 'both': padded[:, pad:pad + t, :] = x elif side == 'after': @@ -306,7 +317,10 @@ def fold_with_overlap(self, x, target, overlap): padding = target + 2 * overlap - remaining x = self.pad_tensor(x, padding, side='after') - folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda() + if torch.cuda.is_available(): + folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda() + else: + folded = torch.zeros(num_folds, target + 2 * overlap, features).cpu() # Get the values for the folded tensor for i in range(num_folds): From 4bf832c3a97541884634a4e54575a912f002a7ac Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 28 Apr 2020 00:25:11 -0700 Subject: [PATCH 03/43] mp3 fix remove eof --- .gitignore | 2 +- requirements.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index c061d1d14..eaaeb517c 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,4 @@ encoder/saved_models/* synthesizer/saved_models/* vocoder/saved_models/* -*.bak \ No newline at end of file +*.bak diff --git a/requirements.txt b/requirements.txt index 373d5496b..c3d9bf25b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ PyQt5 multiprocess numba unidecode +PySoundFile ## Possible linux fix for webrtcvad failure #sudo apt install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip From 5734f1a40f58edbd468823fdb3f9aee035d2514e Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 28 Apr 2020 00:26:44 -0700 Subject: [PATCH 04/43] updated Readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 429009cda..280beb68c 100644 --- a/README.md +++ b/README.md @@ -36,8 +36,6 @@ You will need the following whether you plan to use the toolbox only or to retra Run `pip install -r requirements.txt` to install the necessary packages. Additionally you will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1). 
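Editor's note: the patches in this series drop the hard GPU requirement, so a quick way to confirm which device PyTorch will actually pick is useful. This is a minimal, hedged sketch (not part of any patch), assuming only that torch is installed:

import torch

# The patched demo scripts fall back to CPU when CUDA is unavailable;
# this mirrors that check so the setup can be verified up front.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
if device.type == "cuda":
    print("GPU:", torch.cuda.get_device_name(torch.cuda.current_device()))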
-A GPU is mandatory, but you don't necessarily need a high tier GPU if you only want to use the toolbox. - ### Pretrained models Download the latest [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). @@ -48,6 +46,8 @@ Before you download any dataset, you can begin by testing your configuration wit If all tests pass, you're good to go. +To use the cpu, use the option `--cpu`. + ### Datasets For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `/LibriSpeech/train-clean-100` where `` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). If you chose to use your own dataset you'll need your own audio files, or you will have to record it with the toolbox. From 485d28501d197a034a90c86c137a3544ec332fa0 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 28 Apr 2020 00:39:41 -0700 Subject: [PATCH 05/43] changed to reflect that only .wav can train --- demo_cli.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/demo_cli.py b/demo_cli.py index 867098c4c..f1632db53 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -121,8 +121,7 @@ while True: try: # Get the reference audio filepath - message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \ - "wav, m4a, flac, ...):\n" + message = "Reference voice: enter an audio filepath of a voice to be cloned (.wav only):\n" in_fpath = Path(input(message).replace("\"", "").replace("\'", "")) From b333e733f516a7e13bccf7d1623ab35439fc9aa5 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 28 Apr 2020 01:10:09 -0700 Subject: [PATCH 06/43] correction. model does load more than wav print to console when using cpu --- demo_cli.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/demo_cli.py b/demo_cli.py index f1632db53..f4d715072 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -41,7 +41,7 @@ ## Print some environment information (for debugging purposes) print("Running a test of your configuration...\n") if args.cpu: - encoder.load_model(args.enc_model_fpath) + print("Using CPU for inference.") elif torch.cuda.is_available(): device_id = torch.cuda.current_device() gpu_properties = torch.cuda.get_device_properties(device_id) @@ -120,8 +120,9 @@ num_generated = 0 while True: try: - # Get the reference audio filepath - message = "Reference voice: enter an audio filepath of a voice to be cloned (.wav only):\n" + # Get the reference audio filepath + message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \ + "wav, m4a, flac, ...):\n" in_fpath = Path(input(message).replace("\"", "").replace("\'", "")) From 4e81aebc2f3b5051b774f1120362c63242d2ab8c Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 28 Apr 2020 01:23:46 -0700 Subject: [PATCH 07/43] weird path problem nothing other than wav works, even though comment says it does... 
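Editor's note on the commit message above: librosa decodes wav/flac through libsndfile, while mp3/m4a normally need an audioread backend such as ffmpeg, which is the usual reason "only wav works". A small, hedged check (not part of the patch; the path below is a placeholder):

import librosa

ref_path = "reference_voice.wav"  # placeholder, point this at your own file
try:
    wav, sr = librosa.load(ref_path, sr=None)
    print("Decoded %d samples at %d Hz" % (len(wav), sr))
except Exception as exc:  # mp3/m4a typically fail here without ffmpeg/audioread installed
    print("Could not decode %s: %s" % (ref_path, exc))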
--- demo_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo_cli.py b/demo_cli.py index f4d715072..19aa04b36 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -123,7 +123,7 @@ # Get the reference audio filepath message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \ "wav, m4a, flac, ...):\n" - in_fpath = Path(input(message).replace("\"", "").replace("\'", "")) + in_fpath = input(str(message)) ## Computing the embedding From df29bec37fb86467fbccf3d879b1f94da72832de Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 28 Apr 2020 01:29:05 -0700 Subject: [PATCH 08/43] string replace added back just in case people use quotes --- demo_cli.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/demo_cli.py b/demo_cli.py index 19aa04b36..c9ecd3d1c 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -123,7 +123,8 @@ # Get the reference audio filepath message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \ "wav, m4a, flac, ...):\n" - in_fpath = input(str(message)) + #in_fpath = input(str(message).replace("\"", '').replace("\'", '')) + in_fpath = '/mnt/c/Users/jakep/Music/Eliza Cassan/Deus Ex Human Revolution - All Eliza Cassan News Broadcasts.wav' ## Computing the embedding @@ -146,7 +147,8 @@ ## Generating the spectrogram - text = input("Write a sentence (+-20 words) to be synthesized:\n") + #text = input("Write a sentence (+-20 words) to be synthesized:\n") + text = 'Hello. This is Eliza Cassan, coming to you live from Picus TV. The future is now.' # The synthesizer works in batch, so you need to put your data in a list or numpy array texts = [text] From f8baa026b8541a7d0fa7a3bb36e85d4149a4073c Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 28 Apr 2020 01:30:00 -0700 Subject: [PATCH 09/43] confirm --- demo_cli.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/demo_cli.py b/demo_cli.py index c9ecd3d1c..e6c2028d3 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -123,8 +123,7 @@ # Get the reference audio filepath message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \ "wav, m4a, flac, ...):\n" - #in_fpath = input(str(message).replace("\"", '').replace("\'", '')) - in_fpath = '/mnt/c/Users/jakep/Music/Eliza Cassan/Deus Ex Human Revolution - All Eliza Cassan News Broadcasts.wav' + in_fpath = input(str(message).replace("\"", '').replace("\'", '')) ## Computing the embedding @@ -147,8 +146,7 @@ ## Generating the spectrogram - #text = input("Write a sentence (+-20 words) to be synthesized:\n") - text = 'Hello. This is Eliza Cassan, coming to you live from Picus TV. The future is now.' 
+ text = input("Write a sentence (+-20 words) to be synthesized:\n") # The synthesizer works in batch, so you need to put your data in a list or numpy array texts = [text] From 1f8eeab33e23b4896d5b1757f7d3525e6a271b26 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 28 Apr 2020 02:07:11 -0700 Subject: [PATCH 10/43] update of variables to fit tensorflow 2.0 (functions untoched, just variables) --- requirements.txt | 2 +- synthesizer/inference.py | 4 ++-- synthesizer/models/attention.py | 8 ++++---- synthesizer/models/modules.py | 36 ++++++++++++++++----------------- synthesizer/models/tacotron.py | 18 ++++++++--------- synthesizer/tacotron2.py | 20 +++++++++--------- 6 files changed, 44 insertions(+), 44 deletions(-) diff --git a/requirements.txt b/requirements.txt index c3d9bf25b..a31f459f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ Unidecode inflect PyQt5 multiprocess -numba +numba==0.48.0 unidecode PySoundFile diff --git a/synthesizer/inference.py b/synthesizer/inference.py index 99fb77810..37562c453 100644 --- a/synthesizer/inference.py +++ b/synthesizer/inference.py @@ -54,7 +54,7 @@ def load(self): """ if self._low_mem: raise Exception("Cannot load the synthesizer permanently in low mem mode") - tf.reset_default_graph() + tf.compat.v1.reset_default_graph() self._model = Tacotron2(self.checkpoint_fpath, hparams) def synthesize_spectrograms(self, texts: List[str], @@ -88,7 +88,7 @@ def synthesize_spectrograms(self, texts: List[str], @staticmethod def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts): # Load the model and forward the inputs - tf.reset_default_graph() + tf.compat.v1.reset_default_graph() model = Tacotron2(checkpoint_fpath, hparams) specs, alignments = model.my_synthesize(embeddings, texts) diff --git a/synthesizer/models/attention.py b/synthesizer/models/attention.py index 58892ad74..1f40d4563 100644 --- a/synthesizer/models/attention.py +++ b/synthesizer/models/attention.py @@ -60,10 +60,10 @@ def _location_sensitive_score(W_query, W_fil, W_keys): dtype = W_query.dtype num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1] - v_a = tf.get_variable( + v_a = tf.compat.v1.get_variable( "attention_variable_projection", shape=[num_units], dtype=dtype, initializer=tf.contrib.layers.xavier_initializer()) - b_a = tf.get_variable( + b_a = tf.compat.v1.get_variable( "attention_bias", shape=[num_units], dtype=dtype, initializer=tf.zeros_initializer()) @@ -155,10 +155,10 @@ def __init__(self, probability_fn=normalization_function, name=name) - self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters, + self.location_convolution = tf.compat.v1.layers.Conv1D(filters=hparams.attention_filters, kernel_size=hparams.attention_kernel, padding="same", use_bias=True, bias_initializer=tf.zeros_initializer(), name="location_features_convolution") - self.location_layer = tf.layers.Dense(units=num_units, use_bias=False, + self.location_layer = tf.compat.v1.layers.Dense(units=num_units, use_bias=False, dtype=tf.float32, name="location_features_layer") self._cumulate = cumulate_weights diff --git a/synthesizer/models/modules.py b/synthesizer/models/modules.py index 769657220..236f139b5 100644 --- a/synthesizer/models/modules.py +++ b/synthesizer/models/modules.py @@ -6,12 +6,12 @@ def __init__(self, units, name=None): self.units = units self.scope = "HighwayNet" if name is None else name - self.H_layer = tf.layers.Dense(units=self.units, activation=tf.nn.relu, name="H") - self.T_layer = 
tf.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T", + self.H_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.relu, name="H") + self.T_layer = tf.compat.v1.layers.Dense(units=self.units, activation=tf.nn.sigmoid, name="T", bias_initializer=tf.constant_initializer(-1.)) def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): H = self.H_layer(inputs) T = self.T_layer(inputs) return H * T + inputs * (1. - T) @@ -38,8 +38,8 @@ def __init__(self, K, conv_channels, pool_size, projections, projection_kernel_s self._bw_cell = tf.nn.rnn_cell.GRUCell(rnn_units, name="{}_backward_RNN".format(self.scope)) def __call__(self, inputs, input_lengths): - with tf.variable_scope(self.scope): - with tf.variable_scope("conv_bank"): + with tf.compat.v1.variable_scope(self.scope): + with tf.compat.v1.variable_scope("conv_bank"): # Convolution bank: concatenate on the last axis to stack channels from all # convolutions # The convolution bank uses multiple different kernel sizes to have many insights @@ -71,7 +71,7 @@ def __call__(self, inputs, input_lengths): # Additional projection in case of dimension mismatch (for HighwayNet "residual" # connection) if highway_input.shape[2] != self.highway_units: - highway_input = tf.layers.dense(highway_input, self.highway_units) + highway_input = tf.compat.v1.layers.Dense(highway_input, self.highway_units) # 4-layer HighwayNet for highwaynet in self.highwaynet_layers: @@ -88,7 +88,7 @@ def __call__(self, inputs, input_lengths): return tf.concat(outputs, axis=2) # Concat forward and backward outputs -class ZoneoutLSTMCell(tf.nn.rnn_cell.RNNCell): +class ZoneoutLSTMCell(tf.compat.v1.nn.rnn_cell.RNNCell): """Wrapper for tf LSTM to create Zoneout LSTM Cell inspired by: @@ -153,7 +153,7 @@ def __call__(self, inputs, state, scope=None): c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h - new_state = tf.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c, + new_state = tf.compat.v1.nn.rnn_cell.LSTMStateTuple(c, h) if self.state_is_tuple else tf.concat(1, [c, h]) return output, new_state @@ -184,7 +184,7 @@ def __init__(self, is_training, hparams, activation=tf.nn.relu, scope=None): self.enc_conv_num_layers = hparams.enc_conv_num_layers def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): x = inputs for i in range(self.enc_conv_num_layers): x = conv1d(x, self.kernel_size, self.channels, self.activation, @@ -226,7 +226,7 @@ def __init__(self, is_training, size=256, zoneout=0.1, scope=None): name="encoder_bw_LSTM") def __call__(self, inputs, input_lengths): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn( self._fw_cell, self._bw_cell, @@ -263,7 +263,7 @@ def __init__(self, is_training, layers_sizes=[256, 256], drop_rate=0.5, activati def __call__(self, inputs): x = inputs - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): for i, size in enumerate(self.layers_sizes): dense = tf.layers.dense(x, units=size, activation=self.activation, name="dense_{}".format(i + 1)) @@ -305,7 +305,7 @@ def __init__(self, is_training, layers=2, size=1024, zoneout=0.1, scope=None): self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True) def __call__(self, inputs, states): - with 
tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): return self._cell(inputs, states) @@ -327,14 +327,14 @@ def __init__(self, shape=80, activation=None, scope=None): self.activation = activation self.scope = "Linear_projection" if scope is None else scope - self.dense = tf.layers.Dense(units=shape, activation=activation, + self.dense = tf.compat.v1.layers.Dense(units=shape, activation=activation, name="projection_{}".format(self.scope)) def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): # If activation==None, this returns a simple Linear projection # else the projection will be passed through an activation function - # output = tf.layers.dense(inputs, units=self.shape, activation=self.activation, + # output = tf.compat.v1.layers.Dense(inputs, units=self.shape, activation=self.activation, # name="projection_{}".format(self.scope)) output = self.dense(inputs) @@ -362,7 +362,7 @@ def __init__(self, is_training, shape=1, activation=tf.nn.sigmoid, scope=None): self.scope = "stop_token_projection" if scope is None else scope def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): output = tf.layers.dense(inputs, units=self.shape, activation=None, name="projection_{}".format(self.scope)) @@ -399,7 +399,7 @@ def __init__(self, is_training, hparams, activation=tf.nn.tanh, scope=None): self.drop_rate = hparams.tacotron_dropout_rate def __call__(self, inputs): - with tf.variable_scope(self.scope): + with tf.compat.v1.variable_scope(self.scope): x = inputs for i in range(self.postnet_num_layers - 1): x = conv1d(x, self.kernel_size, self.channels, self.activation, @@ -412,7 +412,7 @@ def __call__(self, inputs): def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, scope): - with tf.variable_scope(scope): + with tf.compat.v1.variable_scope(scope): conv1d_output = tf.layers.conv1d( inputs, filters=channels, diff --git a/synthesizer/models/tacotron.py b/synthesizer/models/tacotron.py index 9c4de4df0..6f5eac3bb 100644 --- a/synthesizer/models/tacotron.py +++ b/synthesizer/models/tacotron.py @@ -120,9 +120,9 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)] for i in range(hp.tacotron_num_gpus): - with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", + with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])): - with tf.variable_scope("inference") as scope: + with tf.compat.v1.variable_scope("inference") as scope: assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled") if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training: assert global_step is not None @@ -132,7 +132,7 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, post_condition = hp.predict_linear and not gta # Embeddings ==> [batch_size, sequence_length, embedding_dim] - self.embedding_table = tf.get_variable( + self.embedding_table = tf.compat.v1.get_variable( "inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32) embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i]) @@ -283,7 +283,7 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, self.tower_targets_lengths = tower_targets_lengths self.tower_stop_token_targets = 
tower_stop_token_targets - self.all_vars = tf.trainable_variables() + self.all_vars = tf.compat.v1.trainable_variables() log("Initialized Tacotron model. Dimensions (? = dynamic shape): ") log(" Train mode: {}".format(is_training)) @@ -331,9 +331,9 @@ def add_loss(self): range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)] for i in range(hp.tacotron_num_gpus): - with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", + with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])): - with tf.variable_scope("loss") as scope: + with tf.compat.v1.variable_scope("loss") as scope: if hp.mask_decoder: # Compute loss of predictions before postnet before = MaskedMSE(self.tower_mel_targets[i], self.tower_decoder_output[i], @@ -439,7 +439,7 @@ def add_optimizer(self, global_step): grad_device = "/cpu:0" if hp.tacotron_num_gpus > 1 else gpus[0] with tf.device(grad_device): - with tf.variable_scope("optimizer") as scope: + with tf.compat.v1.variable_scope("optimizer") as scope: if hp.tacotron_decay_learning_rate: self.decay_steps = hp.tacotron_decay_steps self.decay_rate = hp.tacotron_decay_rate @@ -454,10 +454,10 @@ def add_optimizer(self, global_step): # 2. Compute Gradient for i in range(hp.tacotron_num_gpus): # Device placement - with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", + with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])): # agg_loss += self.tower_loss[i] - with tf.variable_scope("optimizer") as scope: + with tf.compat.v1.variable_scope("optimizer") as scope: gradients = optimizer.compute_gradients(self.tower_loss[i]) tower_gradients.append(gradients) diff --git a/synthesizer/tacotron2.py b/synthesizer/tacotron2.py index 4a5b199ce..e4c68505f 100644 --- a/synthesizer/tacotron2.py +++ b/synthesizer/tacotron2.py @@ -12,13 +12,13 @@ class Tacotron2: def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"): log("Constructing model: %s" % model_name) #Force the batch size to be known in order to use attention masking in batch synthesis - inputs = tf.placeholder(tf.int32, (None, None), name="inputs") - input_lengths = tf.placeholder(tf.int32, (None,), name="input_lengths") - speaker_embeddings = tf.placeholder(tf.float32, (None, hparams.speaker_embedding_size), + inputs = tf.compat.v1.placeholder(tf.int32, (None, None), name="inputs") + input_lengths = tf.compat.v1.placeholder(tf.int32, (None,), name="input_lengths") + speaker_embeddings = tf.compat.v1.placeholder(tf.float32, (None, hparams.speaker_embedding_size), name="speaker_embeddings") - targets = tf.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets") - split_infos = tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos") - with tf.variable_scope("Tacotron_model") as scope: + targets = tf.compat.v1.placeholder(tf.float32, (None, None, hparams.num_mels), name="mel_targets") + split_infos = tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos") + with tf.compat.v1.variable_scope("Tacotron_model") as scope: self.model = create_model(model_name, hparams) if gta: self.model.initialize(inputs, input_lengths, speaker_embeddings, targets, gta=gta, @@ -52,14 +52,14 @@ def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"): log("Loading checkpoint: %s" % checkpoint_path) #Memory allocation on the GPUs as needed - config = 
tf.ConfigProto() + config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True - self.session = tf.Session(config=config) - self.session.run(tf.global_variables_initializer()) + self.session = tf.compat.v1.Session(config=config) + self.session.run(tf.compat.v1.global_variables_initializer()) - saver = tf.train.Saver() + saver = tf.compat.v1.train.Saver() saver.restore(self.session, checkpoint_path) def my_synthesize(self, speaker_embeds, texts): From 9add1ef3ffd59d04a394ead6b0494b6cec716833 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 28 Apr 2020 02:39:28 -0700 Subject: [PATCH 11/43] partial update for compatible tensorflow api v2 --- requirements.txt | 6 +++++- synthesizer/models/modules.py | 12 ++++++------ synthesizer/models/tacotron.py | 6 +++--- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/requirements.txt b/requirements.txt index a31f459f5..5053c1d86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,4 +20,8 @@ PySoundFile ## Possible linux fix for webrtcvad failure #sudo apt install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip ## Possible linux fix for 'OSError: PortAudio library not found' -# sudo apt-get install libportaudio2 \ No newline at end of file +# sudo apt-get install libportaudio2 + +tf.compat.v1.layers.dense + +tf.compat.v1.nn.rnn_cell.MultiRNNCell \ No newline at end of file diff --git a/synthesizer/models/modules.py b/synthesizer/models/modules.py index 236f139b5..cb41ccadd 100644 --- a/synthesizer/models/modules.py +++ b/synthesizer/models/modules.py @@ -109,7 +109,7 @@ def __init__(self, num_units, is_training, zoneout_factor_cell=0., zoneout_facto if zm < 0. or zs > 1.: raise ValueError("One/both provided Zoneout factors are not in [0, 1]") - self._cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name) + self._cell = tf.compat.v1.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name) self._zoneout_cell = zoneout_factor_cell self._zoneout_outputs = zoneout_factor_output self.is_training = is_training @@ -227,7 +227,7 @@ def __init__(self, is_training, size=256, zoneout=0.1, scope=None): def __call__(self, inputs, input_lengths): with tf.compat.v1.variable_scope(self.scope): - outputs, (fw_state, bw_state) = tf.nn.bidirectional_dynamic_rnn( + outputs, (fw_state, bw_state) = tf.compat.v1.nn.bidirectional_dynamic_rnn( self._fw_cell, self._bw_cell, inputs, @@ -265,7 +265,7 @@ def __call__(self, inputs): with tf.compat.v1.variable_scope(self.scope): for i, size in enumerate(self.layers_sizes): - dense = tf.layers.dense(x, units=size, activation=self.activation, + dense = tf.compat.v1.layers.dense(x, units=size, activation=self.activation, name="dense_{}".format(i + 1)) # The paper discussed introducing diversity in generation at inference time # by using a dropout of 0.5 only in prenet layers (in both training and inference). 
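Editor's aside between hunks: at this point the series pins tensorflow==1.15, and the tf.compat.v1 renames above keep the graph-mode code importable on newer wheels as well. A minimal, hedged sketch of the pattern (illustrative only, not code from this repository; assumes TF 1.15 or a 2.x build that still ships compat.v1):

import tensorflow as tf

# Graph-mode code built on placeholders/Sessions needs eager execution off under TF 2.x;
# under TF 1.15 eager is off by default, so this call is harmless there.
tf.compat.v1.disable_eager_execution()

x = tf.compat.v1.placeholder(tf.float32, shape=(None, 80), name="mel_frames")
y = tf.compat.v1.layers.dense(x, units=256, name="example_projection")

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    print(sess.run(y, feed_dict={x: [[0.0] * 80]}).shape)  # (1, 256)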
@@ -302,7 +302,7 @@ def __init__(self, is_training, layers=2, size=1024, zoneout=0.1, scope=None): name="decoder_LSTM_{}".format(i + 1)) for i in range(layers)] - self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True) + self._cell = tf.compat.v1.nn.rnn_cell.MultiRNNCell(self.rnn_layers, state_is_tuple=True) def __call__(self, inputs, states): with tf.compat.v1.variable_scope(self.scope): @@ -413,13 +413,13 @@ def __call__(self, inputs): def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, scope): with tf.compat.v1.variable_scope(scope): - conv1d_output = tf.layers.conv1d( + conv1d_output = tf.compat.v1.layers.conv1d( inputs, filters=channels, kernel_size=kernel_size, activation=None, padding="same") - batched = tf.layers.batch_normalization(conv1d_output, training=is_training) + batched = tf.compat.v1.layers.batch_normalization(conv1d_output, training=is_training) activated = activation(batched) return tf.layers.dropout(activated, rate=drop_rate, training=is_training, name="dropout_{}".format(scope)) diff --git a/synthesizer/models/tacotron.py b/synthesizer/models/tacotron.py index 6f5eac3bb..b516157b5 100644 --- a/synthesizer/models/tacotron.py +++ b/synthesizer/models/tacotron.py @@ -83,10 +83,10 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, ############## - p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) - p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]], + p_inputs = tf.compat.v1.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) + p_mel_targets = tf.compat.v1.py_func(split_func, [mel_targets, split_infos[:, 1]], lout_float) if mel_targets is not None else mel_targets - p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]], + p_stop_token_targets = tf.compat.v1.py_func(split_func, [stop_token_targets, split_infos[:, 2]], lout_float) if stop_token_targets is not None else \ stop_token_targets From d7218d86846a252205de36124ce2c6c5f072cff4 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 28 Apr 2020 02:40:16 -0700 Subject: [PATCH 12/43] cleanup --- requirements.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5053c1d86..a31f459f5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,8 +20,4 @@ PySoundFile ## Possible linux fix for webrtcvad failure #sudo apt install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip ## Possible linux fix for 'OSError: PortAudio library not found' -# sudo apt-get install libportaudio2 - -tf.compat.v1.layers.dense - -tf.compat.v1.nn.rnn_cell.MultiRNNCell \ No newline at end of file +# sudo apt-get install libportaudio2 \ No newline at end of file From df70389d25e50fc2194f3a67649a8a9e4a3f435b Mon Sep 17 00:00:00 2001 From: pusalieth Date: Wed, 29 Apr 2020 00:29:40 -0700 Subject: [PATCH 13/43] update to gitignore -exclude .gz --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index eaaeb517c..20bbb03f8 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ encoder/saved_models/* synthesizer/saved_models/* vocoder/saved_models/* *.bak +*.gz + From 3a5925aae425f441b963b8744742275398c98535 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Wed, 29 Apr 2020 01:00:46 -0700 Subject: [PATCH 14/43] demo_toolbox Path fix --- requirements.txt | 6 +++++- toolbox/ui.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git 
a/requirements.txt b/requirements.txt index a31f459f5..8486e2b25 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,5 +19,9 @@ PySoundFile ## Possible linux fix for webrtcvad failure #sudo apt install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip + ## Possible linux fix for 'OSError: PortAudio library not found' -# sudo apt-get install libportaudio2 \ No newline at end of file +# sudo apt-get install libportaudio2 + +# Fix for "This application failed to start because no Qt platform plugin could be initialized. Reinstalling the application may fix this problem." +# sudo apt-get install libxkbcommon-x11-dev \ No newline at end of file diff --git a/toolbox/ui.py b/toolbox/ui.py index c40fca500..9b77b3db4 100644 --- a/toolbox/ui.py +++ b/toolbox/ui.py @@ -188,7 +188,7 @@ def browse_file(self): caption="Select an audio file", filter="Audio Files (*.mp3 *.flac *.wav *.m4a)" ) - return Path(fpath[0]) if fpath[0] != "" else "" + return str(fpath[0]) if fpath[0] != "" else "" @staticmethod def repopulate_box(box, items, random=False): From ead237a1915cd3d3d6746254dbe73734a4f36089 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Wed, 29 Apr 2020 02:14:05 -0700 Subject: [PATCH 15/43] setting install config -- need to run on native linux --- .gitignore | 6 ++++-- requirements.txt | 8 +------- setup.sh | 1 + 3 files changed, 6 insertions(+), 9 deletions(-) create mode 100644 setup.sh diff --git a/.gitignore b/.gitignore index 20bbb03f8..81bc270e4 100644 --- a/.gitignore +++ b/.gitignore @@ -14,10 +14,12 @@ *.bcf *.toc *.wav -*.sh encoder/saved_models/* synthesizer/saved_models/* vocoder/saved_models/* *.bak *.gz - +LibriSpeech/* +*.txt +*.TXT +*.flac diff --git a/requirements.txt b/requirements.txt index 8486e2b25..a43243d7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -# python3.7 or less +# python3.7 confirmed tensorflow==1.15 umap-learn visdom @@ -17,11 +17,5 @@ numba==0.48.0 unidecode PySoundFile -## Possible linux fix for webrtcvad failure -#sudo apt install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip - -## Possible linux fix for 'OSError: PortAudio library not found' -# sudo apt-get install libportaudio2 - # Fix for "This application failed to start because no Qt platform plugin could be initialized. Reinstalling the application may fix this problem." # sudo apt-get install libxkbcommon-x11-dev \ No newline at end of file diff --git a/setup.sh b/setup.sh new file mode 100644 index 000000000..841c97644 --- /dev/null +++ b/setup.sh @@ -0,0 +1 @@ +sudo apt -y install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip libportaudio2 \ No newline at end of file From f43696868ebe37e81e3498c6c3353cd9201d5b33 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Thu, 30 Apr 2020 17:24:16 -0700 Subject: [PATCH 16/43] windows fix -- issue with spaces in path for input file Testing: -- remove all packages from requirements that aren't neccesary for demo_cli.py. Slowly add them back in. 
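Editor's note on the commit message above: quoted Windows paths containing spaces are the usual culprit when input files fail to open. A small, hedged sketch (not the patch's own code) of accepting such a pasted path:

from pathlib import Path

raw = input("Reference voice (paste or drag-and-drop a path): ")
# Windows drag-and-drop wraps paths containing spaces in quotes; strip them first.
in_fpath = Path(raw.strip().strip('"').strip("'"))
print("Exists:", in_fpath.exists(), "->", in_fpath)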
--- README.md | 2 ++ requirements.txt | 35 ++++++++++++++++++----------------- setup.bat | 10 ++++++++++ setup.sh | 18 +++++++++++++++++- 4 files changed, 47 insertions(+), 18 deletions(-) create mode 100644 setup.bat diff --git a/README.md b/README.md index 280beb68c..43cd39bcf 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,8 @@ You will need the following whether you plan to use the toolbox only or to retra Run `pip install -r requirements.txt` to install the necessary packages. Additionally you will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1). +To use preconfigured setup batch instructions, please use .sh and .bat for linux and windows respectively. + ### Pretrained models Download the latest [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). diff --git a/requirements.txt b/requirements.txt index a43243d7a..202ad2647 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,21 +1,22 @@ -# python3.7 confirmed +# python3.7.x (6,7) confirmed +unidecode +inflect tensorflow==1.15 -umap-learn -visdom -webrtcvad-wheels -librosa>=0.5.1 -matplotlib>=2.0.2 numpy>=1.14.0 -scipy>=1.0.0 -tqdm -sounddevice -Unidecode -inflect -PyQt5 -multiprocess -numba==0.48.0 -unidecode +matplotlib>=2.0.2 +librosa>=0.5.1 PySoundFile +multiprocess +webrtcvad +sounddevice + + -# Fix for "This application failed to start because no Qt platform plugin could be initialized. Reinstalling the application may fix this problem." -# sudo apt-get install libxkbcommon-x11-dev \ No newline at end of file +#umap-learn +#visdom +#webrtcvad-wheels +#scipy>=1.0.0 +#tqdm +#PyQt5 +#numba==0.48.0 +'C:\Users\jakep\Music\Eliza Cassan\Eliza Cassan Clipped.wav' \ No newline at end of file diff --git a/setup.bat b/setup.bat new file mode 100644 index 000000000..78b6e877b --- /dev/null +++ b/setup.bat @@ -0,0 +1,10 @@ +curl https://www.python.org/ftp/python/3.7.7/python-3.7.7-amd64.exe -o %userprofile%/Downloads/python-3.7.7-amd64.exe +%userprofile%/Downloads/python-3.7.7-amd64.exe + +curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -o %userprofile%/Downloads/Miniconda3-latest-Windows-x86_64.exe +%userprofile%/Downloads/Miniconda3-latest-Windows-x86_64.exe + +conda -y install pytorch +start cmd /k %userprofile%/miniconda3/Scripts/activate base +cd /D "%~dp0" +pip install -r requirements.txt \ No newline at end of file diff --git a/setup.sh b/setup.sh index 841c97644..2b365daf2 100644 --- a/setup.sh +++ b/setup.sh @@ -1 +1,17 @@ -sudo apt -y install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip libportaudio2 \ No newline at end of file +conda_installed=$(conda list | grep ) +if [ $conda_installed != '' ]; then + wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o ~/Downloads + chmod +x ~/Downloads/Miniconda3-latest-Linux-x86_64.sh + ./Miniconda3-latest-Linux-x86_64.sh + conda create -n RTVC python + conda activate RTVC +fi + +sudo apt -y install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip libportaudio2 +conda update +conda install pytorch -c pytorch +python3.7 -m pip install -r requirements.txt +echo "Finished installation" + +# Fix for "This application failed to start because no Qt platform plugin could be initialized. Reinstalling the application may fix this problem." 
+# sudo apt-get install libxkbcommon-x11-dev From d3ed21258fb6d53adc49f46e36f5029e90db26c3 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Thu, 30 Apr 2020 17:33:27 -0700 Subject: [PATCH 17/43] linux install script fixed --- requirements.txt | 1 - setup.sh | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 202ad2647..92f4e6633 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,3 @@ sounddevice #tqdm #PyQt5 #numba==0.48.0 -'C:\Users\jakep\Music\Eliza Cassan\Eliza Cassan Clipped.wav' \ No newline at end of file diff --git a/setup.sh b/setup.sh index 2b365daf2..5118fcc2e 100644 --- a/setup.sh +++ b/setup.sh @@ -1,5 +1,5 @@ -conda_installed=$(conda list | grep ) -if [ $conda_installed != '' ]; then +conda_installed=$(conda list | grep 'conda: command not found') +if [ $conda_installed != 'conda: command not found' ]; then wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o ~/Downloads chmod +x ~/Downloads/Miniconda3-latest-Linux-x86_64.sh ./Miniconda3-latest-Linux-x86_64.sh From 64da7dd92fb0d0ba3d2d9ea563711299ee458e43 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Thu, 30 Apr 2020 17:35:04 -0700 Subject: [PATCH 18/43] logical error fix --- setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.sh b/setup.sh index 5118fcc2e..36b0f901d 100644 --- a/setup.sh +++ b/setup.sh @@ -1,5 +1,5 @@ conda_installed=$(conda list | grep 'conda: command not found') -if [ $conda_installed != 'conda: command not found' ]; then +if [ $conda_installed != '' ]; then wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o ~/Downloads chmod +x ~/Downloads/Miniconda3-latest-Linux-x86_64.sh ./Miniconda3-latest-Linux-x86_64.sh From 04533aafe98fccad0216ffa60d0659d1124aa0b3 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Thu, 30 Apr 2020 18:21:31 -0700 Subject: [PATCH 19/43] linux setup script working from scratch --- setup.sh | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/setup.sh b/setup.sh index 36b0f901d..1b319b77a 100644 --- a/setup.sh +++ b/setup.sh @@ -1,17 +1,18 @@ conda_installed=$(conda list | grep 'conda: command not found') -if [ $conda_installed != '' ]; then - wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o ~/Downloads - chmod +x ~/Downloads/Miniconda3-latest-Linux-x86_64.sh - ./Miniconda3-latest-Linux-x86_64.sh - conda create -n RTVC python - conda activate RTVC +if [ '$conda_installed' != '' ]; then + wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + chmod +x Miniconda3-latest-Linux-x86_64.sh + Miniconda3-latest-Linux-x86_64.sh + mv Miniconda3-latest-Linux-x86_64.sh ~/Downloads fi -sudo apt -y install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip libportaudio2 -conda update -conda install pytorch -c pytorch +conda install pytorch python3.7 -m pip install -r requirements.txt +sudo apt -y install libportaudio2 echo "Finished installation" -# Fix for "This application failed to start because no Qt platform plugin could be initialized. Reinstalling the application may fix this problem." +# Possible fix for "This application failed to start because no Qt platform plugin could be initialized. Reinstalling the application may fix this problem." 
# sudo apt-get install libxkbcommon-x11-dev + +## Possible fix for webrtcvad failure +#sudo apt install python3 python3-dev build-essential libssl-dev libffi-dev libxml2-dev libxslt1-dev zlib1g-dev python-pip \ No newline at end of file From 1267e75e179317f29f9e82b46e5a290014aec725 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Thu, 30 Apr 2020 18:58:59 -0700 Subject: [PATCH 20/43] GUI fixed tested demo_toolbox.py working synthesis and vocode --- requirements.txt | 9 ++++----- toolbox/__init__.py | 9 +++++++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index 92f4e6633..823131d6e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,13 +9,12 @@ PySoundFile multiprocess webrtcvad sounddevice +PyQt5 +umap-learn - - -#umap-learn +## tested demo_cli.py and demo_toolbox.py +## Unused requirements #visdom -#webrtcvad-wheels #scipy>=1.0.0 #tqdm -#PyQt5 #numba==0.48.0 diff --git a/toolbox/__init__.py b/toolbox/__init__.py index 607fa72a0..a199c4b03 100644 --- a/toolbox/__init__.py +++ b/toolbox/__init__.py @@ -8,6 +8,7 @@ import numpy as np import traceback import sys +import os # Use this directory structure for your datasets, or modify it to fit your needs @@ -102,7 +103,11 @@ def load_from_browser(self, fpath=None): self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name) - name = str(fpath.relative_to(self.datasets_root)) + + if(str(self.datasets_root)[0] == '/'): + name = str(fpath.relative_to(self.datasets_root)) + else: + name = os.getcwd() + '/' + str(self.datasets_root) speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name # Select the next utterance @@ -116,7 +121,7 @@ def load_from_browser(self, fpath=None): # Get the wav from the disk. 
We take the wav with the vocoder/synthesizer format for # playback, so as to have a fair comparison with the generated audio - wav = Synthesizer.load_preprocess_wav(fpath) + wav = Synthesizer.load_preprocess_wav(os.getcwd() + '/' + str(fpath)) self.ui.log("Loaded %s" % name) self.add_real_utterance(wav, name, speaker_name) From e85224ca0d946adbe6d321a41e5c1489281e4fa9 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Fri, 1 May 2020 18:02:08 -0700 Subject: [PATCH 21/43] - performance enhancement for cpu and gpu - fpath fix for load from file - updated calls for librosa 0.8.0 (changed wav write to file) - partial update for tensorflow 2.0 api --- demo_cli.py | 4 +- synthesizer/models/modules.py | 8 +- synthesizer/models/tacotron.py | 290 ++++++++++++++++++--------------- toolbox/__init__.py | 16 +- 4 files changed, 177 insertions(+), 141 deletions(-) diff --git a/demo_cli.py b/demo_cli.py index e6c2028d3..49f5d2433 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -5,6 +5,7 @@ from vocoder import inference as vocoder from pathlib import Path import numpy as np +import soundfile as sf import librosa import argparse import torch @@ -178,8 +179,7 @@ # Save it on the disk fpath = "demo_output_%02d.wav" % num_generated print(generated_wav.dtype) - librosa.output.write_wav(fpath, generated_wav.astype(np.float32), - synthesizer.sample_rate) + sf.write(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate) num_generated += 1 print("\nSaved output as %s\n\n" % fpath) diff --git a/synthesizer/models/modules.py b/synthesizer/models/modules.py index cb41ccadd..3096fbfcd 100644 --- a/synthesizer/models/modules.py +++ b/synthesizer/models/modules.py @@ -1,4 +1,5 @@ import tensorflow as tf +import torch class HighwayNet: @@ -108,8 +109,11 @@ def __init__(self, num_units, is_training, zoneout_factor_cell=0., zoneout_facto if zm < 0. or zs > 1.: raise ValueError("One/both provided Zoneout factors are not in [0, 1]") - - self._cell = tf.compat.v1.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=state_is_tuple, name=name) + + if torch.cuda.is_available(): + self._cell = tf.contrib.cudnn_rnn.CudnnLSTM(num_units, name=name) + else: + self._cell = tf.contrib.rnn.LSTMBlockCell(num_units, name=name) self._zoneout_cell = zoneout_factor_cell self._zoneout_outputs = zoneout_factor_output self.is_training = is_training diff --git a/synthesizer/models/tacotron.py b/synthesizer/models/tacotron.py index b516157b5..4738ffcc4 100644 --- a/synthesizer/models/tacotron.py +++ b/synthesizer/models/tacotron.py @@ -24,11 +24,11 @@ def split_func(x, split_pos): class Tacotron(): """Tacotron-2 Feature prediction Model. """ - + def __init__(self, hparams): self._hparams = hparams - - def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, + + def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False, global_step=None, is_training=False, is_evaluating=False, split_infos=None): """ @@ -45,55 +45,59 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, entries in the mel spectrogram. Only needed for training. 
""" if mel_targets is None and stop_token_targets is not None: - raise ValueError("no multi targets were provided but token_targets were given") + raise ValueError( + "no multi targets were provided but token_targets were given") if mel_targets is not None and stop_token_targets is None and not gta: - raise ValueError("Mel targets are provided without corresponding token_targets") + raise ValueError( + "Mel targets are provided without corresponding token_targets") if not gta and self._hparams.predict_linear == True and linear_targets is None and \ - is_training: + is_training: raise ValueError( "Model is set to use post processing to predict linear spectrograms in training " - "but no linear targets given!") + "but no linear targets given!") if gta and linear_targets is not None: - raise ValueError("Linear spectrogram prediction is not supported in GTA mode!") + raise ValueError( + "Linear spectrogram prediction is not supported in GTA mode!") if is_training and self._hparams.mask_decoder and targets_lengths is None: raise RuntimeError( "Model set to mask paddings but no targets lengths provided for the mask!") if is_training and is_evaluating: raise RuntimeError( "Model can not be in training and evaluation modes at the same time!") - + split_device = "/cpu:0" if self._hparams.tacotron_num_gpus > 1 or \ - self._hparams.split_on_cpu else "/gpu:{}".format( - self._hparams.tacotron_gpu_start_idx) + self._hparams.split_on_cpu else "/gpu:{}".format( + self._hparams.tacotron_gpu_start_idx) with tf.device(split_device): hp = self._hparams lout_int = [tf.int32] * hp.tacotron_num_gpus lout_float = [tf.float32] * hp.tacotron_num_gpus - + tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) tower_targets_lengths = \ tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if \ - targets_lengths is not None else targets_lengths - + targets_lengths is not None else targets_lengths + ### SV2TTS ### - + tower_embed_targets = tf.split(embed_targets, num_or_size_splits=hp.tacotron_num_gpus, axis=0) - + ############## - - p_inputs = tf.compat.v1.py_func(split_func, [inputs, split_infos[:, 0]], lout_int) - p_mel_targets = tf.compat.v1.py_func(split_func, [mel_targets, split_infos[:, 1]], - lout_float) if mel_targets is not None else mel_targets - p_stop_token_targets = tf.compat.v1.py_func(split_func, [stop_token_targets, split_infos[:, 2]], - lout_float) if stop_token_targets is not None else \ - stop_token_targets - + + p_inputs = tf.numpy_function( + split_func, [inputs, split_infos[:, 0]], lout_int) + p_mel_targets = tf.numpy_function(split_func, [mel_targets, split_infos[:, 1]], + lout_float) if mel_targets is not None else mel_targets + p_stop_token_targets = tf.numpy_function(split_func, [stop_token_targets, split_infos[:, 2]], + lout_float) if stop_token_targets is not None else \ + stop_token_targets + tower_inputs = [] tower_mel_targets = [] tower_stop_token_targets = [] - + batch_size = tf.shape(inputs)[0] mel_channels = hp.num_mels for i in range(hp.tacotron_num_gpus): @@ -104,69 +108,74 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, if p_stop_token_targets is not None: tower_stop_token_targets.append( tf.reshape(p_stop_token_targets[i], [batch_size, -1])) - + self.tower_decoder_output = [] self.tower_alignments = [] self.tower_stop_token_prediction = [] self.tower_mel_outputs = [] - + tower_embedded_inputs = [] tower_enc_conv_output_shape = [] tower_encoder_cond_outputs = [] tower_residual = [] 
tower_projected_residual = [] - + # 1. Declare GPU Devices gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)] for i in range(hp.tacotron_num_gpus): with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", - worker_device=gpus[i])): + worker_device=gpus[i])): with tf.compat.v1.variable_scope("inference") as scope: - assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled") + assert hp.tacotron_teacher_forcing_mode in ( + "constant", "scheduled") if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training: assert global_step is not None - - # GTA is only used for predicting mels to train Wavenet vocoder, so we ommit + + # GTA is only used for predicting mels to train Wavenet vocoder, so we ommit # post processing when doing GTA synthesis post_condition = hp.predict_linear and not gta - + # Embeddings ==> [batch_size, sequence_length, embedding_dim] self.embedding_table = tf.compat.v1.get_variable( "inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32) - embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i]) - + embedded_inputs = tf.nn.embedding_lookup( + self.embedding_table, tower_inputs[i]) + # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units] encoder_cell = TacotronEncoderCell( - EncoderConvolutions(is_training, hparams=hp, scope="encoder_convolutions"), + EncoderConvolutions( + is_training, hparams=hp, scope="encoder_convolutions"), EncoderRNN(is_training, size=hp.encoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope="encoder_LSTM")) - - encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i]) - + + encoder_outputs = encoder_cell( + embedded_inputs, tower_input_lengths[i]) + # For shape visualization purpose enc_conv_output_shape = encoder_cell.conv_output_shape - - + ### SV2TT2 ### - + # Append the speaker embedding to the encoder output at each timestep - tileable_shape = [-1, 1, self._hparams.speaker_embedding_size] - tileable_embed_targets = tf.reshape(tower_embed_targets[i], tileable_shape) - tiled_embed_targets = tf.tile(tileable_embed_targets, - [1, tf.shape(encoder_outputs)[1], 1]) - encoder_cond_outputs = tf.concat((encoder_outputs, tiled_embed_targets), 2) - + tileable_shape = [-1, 1, + self._hparams.speaker_embedding_size] + tileable_embed_targets = tf.reshape( + tower_embed_targets[i], tileable_shape) + tiled_embed_targets = tf.tile(tileable_embed_targets, + [1, tf.shape(encoder_outputs)[1], 1]) + encoder_cond_outputs = tf.concat( + (encoder_outputs, tiled_embed_targets), 2) + ############## - - + # Decoder Parts # Attention Decoder Prenet prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope="decoder_prenet") # Attention Mechanism attention_mechanism = LocationSensitiveAttention(hp.attention_dim, - encoder_cond_outputs, + encoder_cond_outputs, hparams=hp, mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape( @@ -186,7 +195,7 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, stop_projection = StopProjection(is_training or is_evaluating, shape=hp .outputs_per_step, scope="stop_token_projection") - + # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding) decoder_cell = TacotronDecoderCell( prenet, @@ -194,86 +203,93 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, decoder_lstm, frame_projection, stop_projection) - + # Define the helper for our 
decoder if is_training or is_evaluating or gta: self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step) else: self.helper = TacoTestHelper(batch_size, hp) - + # initial decoder state decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32) - + # Only use max iterations at synthesis time - max_iters = hp.max_iters if not (is_training or is_evaluating) else None - + max_iters = hp.max_iters if not ( + is_training or is_evaluating) else None + # Decode (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode( - CustomDecoder(decoder_cell, self.helper, decoder_init_state), + CustomDecoder(decoder_cell, self.helper, + decoder_init_state), impute_finished=False, maximum_iterations=max_iters, swap_memory=hp.tacotron_swap_with_cpu) - - # Reshape outputs to be one output per entry + + # Reshape outputs to be one output per entry # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels] - decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels]) - stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1]) - + decoder_output = tf.reshape( + frames_prediction, [batch_size, -1, hp.num_mels]) + stop_token_prediction = tf.reshape( + stop_token_prediction, [batch_size, -1]) + # Postnet - postnet = Postnet(is_training, hparams=hp, scope="postnet_convolutions") - - # Compute residual using post-net ==> [batch_size, decoder_steps * r, + postnet = Postnet(is_training, hparams=hp, + scope="postnet_convolutions") + + # Compute residual using post-net ==> [batch_size, decoder_steps * r, # postnet_channels] residual = postnet(decoder_output) - - # Project residual to same dimension as mel spectrogram + + # Project residual to same dimension as mel spectrogram # ==> [batch_size, decoder_steps * r, num_mels] - residual_projection = FrameProjection(hp.num_mels, scope="postnet_projection") + residual_projection = FrameProjection( + hp.num_mels, scope="postnet_projection") projected_residual = residual_projection(residual) - + # Compute the mel spectrogram mel_outputs = decoder_output + projected_residual - + if post_condition: - # Add post-processing CBHG. This does a great job at extracting features - # from mels before projection to Linear specs. + # Add post-processing CBHG. This does a great job at extracting features + # from mels before projection to Linear specs. 
post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels], hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers, hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training, name="CBHG_postnet") - + # [batch_size, decoder_steps(mel_frames), cbhg_channels] post_outputs = post_cbhg(mel_outputs, None) - + # Linear projection of extracted features to make linear spectrogram linear_specs_projection = FrameProjection(hp.num_freq, scope="cbhg_linear_specs_projection") - + # [batch_size, decoder_steps(linear_frames), num_freq] linear_outputs = linear_specs_projection(post_outputs) - + # Grab alignments from the final decoder state alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0]) - + self.tower_decoder_output.append(decoder_output) self.tower_alignments.append(alignments) - self.tower_stop_token_prediction.append(stop_token_prediction) + self.tower_stop_token_prediction.append( + stop_token_prediction) self.tower_mel_outputs.append(mel_outputs) tower_embedded_inputs.append(embedded_inputs) tower_enc_conv_output_shape.append(enc_conv_output_shape) tower_encoder_cond_outputs.append(encoder_cond_outputs) tower_residual.append(residual) tower_projected_residual.append(projected_residual) - + if post_condition: self.tower_linear_outputs.append(linear_outputs) log("initialisation done {}".format(gpus[i])) - + if is_training: self.ratio = self.helper._ratio self.tower_inputs = tower_inputs @@ -282,44 +298,53 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, # self.tower_linear_targets = tower_linear_targets self.tower_targets_lengths = tower_targets_lengths self.tower_stop_token_targets = tower_stop_token_targets - + self.all_vars = tf.compat.v1.trainable_variables() - + log("Initialized Tacotron model. Dimensions (? 
= dynamic shape): ") log(" Train mode: {}".format(is_training)) log(" Eval mode: {}".format(is_evaluating)) log(" GTA mode: {}".format(gta)) - log(" Synthesis mode: {}".format(not (is_training or is_evaluating))) + log(" Synthesis mode: {}".format( + not (is_training or is_evaluating))) log(" Input: {}".format(inputs.shape)) for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx): log(" device: {}".format(i)) - log(" embedding: {}".format(tower_embedded_inputs[i].shape)) - log(" enc conv out: {}".format(tower_enc_conv_output_shape[i])) - log(" encoder out (cond): {}".format(tower_encoder_cond_outputs[i].shape)) - log(" decoder out: {}".format(self.tower_decoder_output[i].shape)) - log(" residual out: {}".format(tower_residual[i].shape)) - log(" projected residual out: {}".format(tower_projected_residual[i].shape)) - log(" mel out: {}".format(self.tower_mel_outputs[i].shape)) + log(" embedding: {}".format( + tower_embedded_inputs[i].shape)) + log(" enc conv out: {}".format( + tower_enc_conv_output_shape[i])) + log(" encoder out (cond): {}".format( + tower_encoder_cond_outputs[i].shape)) + log(" decoder out: {}".format( + self.tower_decoder_output[i].shape)) + log(" residual out: {}".format( + tower_residual[i].shape)) + log(" projected residual out: {}".format( + tower_projected_residual[i].shape)) + log(" mel out: {}".format( + self.tower_mel_outputs[i].shape)) if post_condition: - log(" linear out: {}".format(self.tower_linear_outputs[i].shape)) - log(" out: {}".format(self.tower_stop_token_prediction[i].shape)) - + log(" linear out: {}".format( + self.tower_linear_outputs[i].shape)) + log(" out: {}".format( + self.tower_stop_token_prediction[i].shape)) + # 1_000_000 is causing syntax problems for some people?! Python please :) log(" Tacotron Parameters {:.3f} Million.".format( np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000)) - - + def add_loss(self): """Adds loss to the model. Sets "loss" field. initialize must have been called.""" hp = self._hparams - + self.tower_before_loss = [] self.tower_after_loss = [] self.tower_stop_token_loss = [] self.tower_regularization_loss = [] self.tower_linear_loss = [] self.tower_loss = [] - + total_before_loss = 0 total_after_loss = 0 total_stop_token_loss = 0 @@ -329,10 +354,10 @@ def add_loss(self): gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)] - + for i in range(hp.tacotron_num_gpus): with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", - worker_device=gpus[i])): + worker_device=gpus[i])): with tf.compat.v1.variable_scope("loss") as scope: if hp.mask_decoder: # Compute loss of predictions before postnet @@ -365,15 +390,16 @@ def add_loss(self): stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( labels=self.tower_stop_token_targets[i], logits=self.tower_stop_token_prediction[i])) - + # SV2TTS extra L1 loss - l1 = tf.abs(self.tower_mel_targets[i] - self.tower_decoder_output[i]) + l1 = tf.abs( + self.tower_mel_targets[i] - self.tower_decoder_output[i]) linear_loss = tf.reduce_mean(l1) # if hp.predict_linear: # # Compute linear loss # # From https://github.com/keithito/tacotron/blob/tacotron2-work-in - # # -progress/models/tacotron.py + # # -progress/models/tacotron.py # # Prioritize loss for frequencies under 2000 Hz. 
# l1 = tf.abs(self.tower_linear_targets[i] - self.tower_linear_outputs[i]) # n_priority_freq = int(2000 / (hp.sample_rate * 0.5) * hp.num_freq) @@ -381,34 +407,34 @@ def add_loss(self): # l1[:, :, 0:n_priority_freq]) # else: # linear_loss = 0. - + # Compute the regularization weight if hp.tacotron_scale_regularization: reg_weight_scaler = 1. / ( - 2 * hp.max_abs_value) if hp.symmetric_mels else 1. / ( + 2 * hp.max_abs_value) if hp.symmetric_mels else 1. / ( hp.max_abs_value) reg_weight = hp.tacotron_reg_weight * reg_weight_scaler else: reg_weight = hp.tacotron_reg_weight - + # Regularize variables # Exclude all types of bias, RNN (Bengio et al. On the difficulty of training recurrent neural networks), embeddings and prediction projection layers. # Note that we consider attention mechanism v_a weights as a prediction projection layer and we don"t regularize it. (This gave better stability) regularization = tf.add_n([tf.nn.l2_loss(v) for v in self.all_vars if not ( - "bias" in v.name or "Bias" in v.name or "_projection" in v.name or "inputs_embedding" in v.name - or "RNN" in v.name or "LSTM" in v.name)]) * reg_weight - + "bias" in v.name or "Bias" in v.name or "_projection" in v.name or "inputs_embedding" in v.name + or "RNN" in v.name or "LSTM" in v.name)]) * reg_weight + # Compute final loss term self.tower_before_loss.append(before) self.tower_after_loss.append(after) self.tower_stop_token_loss.append(stop_token_loss) self.tower_regularization_loss.append(regularization) self.tower_linear_loss.append(linear_loss) - + loss = before + after + stop_token_loss + regularization + linear_loss self.tower_loss.append(loss) - + for i in range(hp.tacotron_num_gpus): total_before_loss += self.tower_before_loss[i] total_after_loss += self.tower_after_loss[i] @@ -416,14 +442,14 @@ def add_loss(self): total_regularization_loss += self.tower_regularization_loss[i] total_linear_loss += self.tower_linear_loss[i] total_loss += self.tower_loss[i] - + self.before_loss = total_before_loss / hp.tacotron_num_gpus self.after_loss = total_after_loss / hp.tacotron_num_gpus self.stop_token_loss = total_stop_token_loss / hp.tacotron_num_gpus self.regularization_loss = total_regularization_loss / hp.tacotron_num_gpus self.linear_loss = total_linear_loss / hp.tacotron_num_gpus self.loss = total_loss / hp.tacotron_num_gpus - + def add_optimizer(self, global_step): """Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called. Args: @@ -431,13 +457,13 @@ def add_optimizer(self, global_step): """ hp = self._hparams tower_gradients = [] - + # 1. Declare GPU Devices gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)] - + grad_device = "/cpu:0" if hp.tacotron_num_gpus > 1 else gpus[0] - + with tf.device(grad_device): with tf.compat.v1.variable_scope("optimizer") as scope: if hp.tacotron_decay_learning_rate: @@ -446,21 +472,22 @@ def add_optimizer(self, global_step): self.learning_rate = self._learning_rate_decay( hp.tacotron_initial_learning_rate, global_step) else: - self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate) - + self.learning_rate = tf.convert_to_tensor( + hp.tacotron_initial_learning_rate) + optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1, hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon) - + # 2. 
Compute Gradient for i in range(hp.tacotron_num_gpus): # Device placement with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", - worker_device=gpus[i])): + worker_device=gpus[i])): # agg_loss += self.tower_loss[i] with tf.compat.v1.variable_scope("optimizer") as scope: gradients = optimizer.compute_gradients(self.tower_loss[i]) tower_gradients.append(gradients) - + # 3. Average Gradient with tf.device(grad_device): avg_grads = [] @@ -475,40 +502,41 @@ def add_optimizer(self, global_step): # Average over the "tower" dimension. grad = tf.concat(axis=0, values=grads) grad = tf.reduce_mean(grad, 0) - + v = grad_and_vars[0][1] avg_grads.append(grad) vars.append(v) - + self.gradients = avg_grads # Just for causion # https://github.com/Rayhane-mamah/Tacotron-2/issues/11 if hp.tacotron_clip_gradients: - clipped_gradients, _ = tf.clip_by_global_norm(avg_grads, 1.) # __mark 0.5 refer + clipped_gradients, _ = tf.clip_by_global_norm( + avg_grads, 1.) # __mark 0.5 refer else: clipped_gradients = avg_grads - + # Add dependency on UPDATE_OPS; otherwise batchnorm won"t work correctly. See: # https://github.com/tensorflow/tensorflow/issues/1122 with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): self.optimize = optimizer.apply_gradients(zip(clipped_gradients, vars), global_step=global_step) - + def _learning_rate_decay(self, init_lr, global_step): ################################################################# # Narrow Exponential Decay: - + # Phase 1: lr = 1e-3 # We only start learning rate decay after 50k steps - + # Phase 2: lr in ]1e-5, 1e-3[ # decay reach minimal value at step 310k - + # Phase 3: lr = 1e-5 # clip by minimal learning rate value (step > 310k) ################################################################# hp = self._hparams - + # Compute natural exponential decay lr = tf.train.exponential_decay(init_lr, global_step - hp.tacotron_start_decay, @@ -516,6 +544,6 @@ def _learning_rate_decay(self, init_lr, global_step): self.decay_steps, self.decay_rate, # lr = 1e-5 around step 310k name="lr_exponential_decay") - + # clip learning rate by max and min values (initial and final values) return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr) diff --git a/toolbox/__init__.py b/toolbox/__init__.py index a199c4b03..972f73102 100644 --- a/toolbox/__init__.py +++ b/toolbox/__init__.py @@ -9,6 +9,7 @@ import traceback import sys import os +import soundfile as sf # Use this directory structure for your datasets, or modify it to fit your needs @@ -104,10 +105,10 @@ def load_from_browser(self, fpath=None): self.ui.current_speaker_name, self.ui.current_utterance_name) - if(str(self.datasets_root)[0] == '/'): + if(str(self.datasets_root)[0] == '/' or str(self.datasets_root)[1] == ':'): name = str(fpath.relative_to(self.datasets_root)) else: - name = os.getcwd() + '/' + str(self.datasets_root) + name = os.getcwd() + '/' + str(fpath) speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name # Select the next utterance @@ -116,14 +117,14 @@ def load_from_browser(self, fpath=None): elif fpath == "": return else: - name = fpath.name - speaker_name = fpath.parent.name + name = str(fpath).replace('\\', '/') + speaker_name = 'Custom' # Get the wav from the disk. 
We take the wav with the vocoder/synthesizer format for # playback, so as to have a fair comparison with the generated audio - wav = Synthesizer.load_preprocess_wav(os.getcwd() + '/' + str(fpath)) + wav = Synthesizer.load_preprocess_wav(name) self.ui.log("Loaded %s" % name) - + self.filename = os.path.basename(name) self.add_real_utterance(wav, name, speaker_name) def record(self): @@ -216,6 +217,9 @@ def vocoder_progress(i, seq_len, b_size, gen_rate): wav = wav / np.abs(wav).max() * 0.97 self.ui.play(wav, Synthesizer.sample_rate) + # Save it + sf.write('./Custom_%s.wav' % self.filename, wav, Synthesizer.sample_rate) + # Compute the embedding # TODO: this is problematic with different sampling rates, gotta fix it if not encoder.is_loaded(): From d8dae25ce91b0f0a9e4045735eaa5f09a968c71c Mon Sep 17 00:00:00 2001 From: pusalieth Date: Fri, 1 May 2020 21:32:36 -0700 Subject: [PATCH 22/43] -updated all core functions, and variables to be compatible with tensorflow v2 api -- big issue -- The RNN model makes heavy usage of tf.contrib, which was dropped in v2. All the methods were changed, variables, etc. Will require extensive rework. Possible rewrite. --- synthesizer/models/modules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synthesizer/models/modules.py b/synthesizer/models/modules.py index 3096fbfcd..7efc1a651 100644 --- a/synthesizer/models/modules.py +++ b/synthesizer/models/modules.py @@ -273,7 +273,7 @@ def __call__(self, inputs): name="dense_{}".format(i + 1)) # The paper discussed introducing diversity in generation at inference time # by using a dropout of 0.5 only in prenet layers (in both training and inference). - x = tf.layers.dropout(dense, rate=self.drop_rate, training=True, + x = tf.compat.v1.layers.dropout(dense, rate=self.drop_rate, training=True, name="dropout_{}".format(i + 1) + self.scope) return x @@ -425,7 +425,7 @@ def conv1d(inputs, kernel_size, channels, activation, is_training, drop_rate, sc padding="same") batched = tf.compat.v1.layers.batch_normalization(conv1d_output, training=is_training) activated = activation(batched) - return tf.layers.dropout(activated, rate=drop_rate, training=is_training, + return tf.compat.v1.layers.dropout(activated, rate=drop_rate, training=is_training, name="dropout_{}".format(scope)) From 5de3fa3f0b1d3ffb7167e9bc0da1c2e6ecd1247b Mon Sep 17 00:00:00 2001 From: pusalieth Date: Fri, 1 May 2020 21:41:27 -0700 Subject: [PATCH 23/43] notify the user of cpu usage --- demo_cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/demo_cli.py b/demo_cli.py index 49f5d2433..cac78b7ad 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -58,6 +58,7 @@ print("Your PyTorch installation is not configured. 
If you have a GPU ready " "for deep learning, ensure that the drivers are properly installed, and that your " "CUDA version matches your PyTorch installation.", file=sys.stderr) + print("\nIf you're trying to use a cpu, please use the option --cpu.", file=sys.stderr) quit(-1) From a33f0e87fb543d2f3e479558f05a8084e894871b Mon Sep 17 00:00:00 2001 From: pusalieth Date: Fri, 1 May 2020 22:05:59 -0700 Subject: [PATCH 24/43] readme update -- grammar -- easier to understand -- better instructions for usage --- README.md | 60 +++++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 43cd39bcf..729437f09 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,9 @@ # Real-Time Voice Cloning -This repository is an implementation of [Transfer Learning from Speaker Verification to -Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. Feel free to check [my thesis](https://matheo.uliege.be/handle/2268.2/6801) if you're curious or if you're looking for info I haven't documented yet (don't hesitate to make an issue for that too). Mostly I would recommend giving a quick look to the figures beyond the introduction. +This repository is an implementation of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. Feel free to check [my thesis](https://matheo.uliege.be/handle/2268.2/6801) if you're curious, or if you're looking for info I haven't documented yet. Mostly I would recommend giving a quick look to the figures beyond the introduction. -SV2TTS is a three-stage deep learning framework that allows to create a numerical representation of a voice from a few seconds of audio, and to use it to condition a text-to-speech model trained to generalize to new voices. - -**Video demonstration** (click the picture): +SV2TTS is a three-stage deep learning framework that allows the creation of a numerical representation of a voice from a few seconds of audio, then use that data to condition a text-to-speech model trained to generate new voices. +**Video demonstration** (click the play button): [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA) @@ -18,49 +16,41 @@ SV2TTS is a three-stage deep learning framework that allows to create a numerica |[1712.05884](https://arxiv.org/pdf/1712.05884.pdf) | Tacotron 2 (synthesizer) | Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions | [Rayhane-mamah/Tacotron-2](https://github.com/Rayhane-mamah/Tacotron-2) |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo | -## News -**13/11/19**: I'm sorry that I can't maintain this repo as much as I wish I could. I'm working full time on improving voice cloning techniques and I don't have the time to share my improvements here. Plus this repo relies on a lot of old tensorflow code and it's hard to work with. If you're a researcher, then this repo might be of use to you. **If you just want to clone your voice**, do check our demo on [Resemble.AI](https://www.resemble.ai/) - it will give much better results than this repo and will not require a complex setup. - -**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder. 
You can use your trained encoder models from this repo with it. -**06/07/19:** Need to run within a docker container on a remote server? See [here](https://sean.lane.sh/posts/2019/07/Running-the-Real-Time-Voice-Cloning-project-in-Docker/). +## Get Started +### Requirements +Please use the setup.sh or setup.bat if your on linux and windows respictevly to install the dependancies, and requirements. Currently only python 3.7.x is supported. -**25/06/19:** Experimental support for low-memory GPUs (~2gb) added for the synthesizer. Pass `--low_mem` to `demo_cli.py` or `demo_toolbox.py` to enable it. It adds a big overhead, so it's not recommended if you have enough VRAM. +#### Install Manually: +You will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1) installed first, then run `pip install -r requirements.txt` to install the necessary packages. +### After install Steps +Next you will need pretrained models if you don't plan to train your own. Those can be downloaded at [models](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). These models were trained with a cuda device, so they'll produce finiky result for a cpu. New models will need to be produced first. (As of 5/1/20) Download the models, and uncompress them in this root folder. They should result in /encoder/saved_models, /synthesizer/saved_models, and /vocoder/saved_models. -## Quick start -### Requirements -You will need the following whether you plan to use the toolbox only or to retrain the models. +### Test installation +When you believe you have all the neccesary soup, test the program by running `python demo_cli.py`. +If all tests pass, you're good to go. To use the cpu, use the option `--cpu`. -**Python 3.7**. Python 3.6 might work too, but I wouldn't go lower because I make extensive use of pathlib. +### Generate Audio from dataset +There are a few preconfigured options for datasets. One in perticular, [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz) is made to work from demo_toolbox.py. When you download this dataset, you can locate the directory anywhere, but creating a folder in this directory named datasets is recommended. -Run `pip install -r requirements.txt` to install the necessary packages. Additionally you will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1). +To run the toolbox, use `python demo_toolbox.py -d datasets` if you followed the recommendation for directory location. Otherwise, include the full path to the dataset. -To use preconfigured setup batch instructions, please use .sh and .bat for linux and windows respectively. +To set the speaker, you'll need an input audio file. use browse in the toolbox to your personal audio file, or record to set your own voice. -### Pretrained models -Download the latest [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). +The toolbox supports other datasets, including [dev-train](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). -### Preliminary -Before you download any dataset, you can begin by testing your configuration with: +If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590). -`python demo_cli.py` - -If all tests pass, you're good to go. +## Contributions & Issues -To use the cpu, use the option `--cpu`. 
-### Datasets -For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `/LibriSpeech/train-clean-100` where `` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). If you chose to use your own dataset you'll need your own audio files, or you will have to record it with the toolbox. -### Toolbox -You can then try the toolbox: +## Original Author CorentinJ News +**13/11/19**: I'm sorry that I can't maintain this repo as much as I wish I could. I'm working full time as of June 2019 on improving voice cloning techniques and I don't have the time to share my improvements here. Plus this repo relies on a lot of old tensorflow code and it's hard to work with. If you're a researcher, then this repo might be of use to you. **If you just want to clone your voice**, do check our demo on [Resemble.AI](https://www.resemble.ai/) - it will give much better results than this repo and will not require a complex setup. -`python demo_toolbox.py -d ` -or -`python demo_toolbox.py` +**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder. You can use your trained encoder models from this repo with it. -depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590). +**06/07/19:** Need to run within a docker container on a remote server? See [here](https://sean.lane.sh/posts/2019/07/Running-the-Real-Time-Voice-Cloning-project-in-Docker/). -## Contributions & Issues -I'm working full-time as of June 2019. I don't have time to maintain this repo nor reply to issues. Sorry. +**25/06/19:** Experimental support for low-memory GPUs (~2gb) added for the synthesizer. Pass `--low_mem` to `demo_cli.py` or `demo_toolbox.py` to enable it. It adds a big overhead, so it's not recommended if you have enough VRAM. \ No newline at end of file From e90269339a99259252a2e92d817824313fdaeffa Mon Sep 17 00:00:00 2001 From: pusalieth Date: Fri, 1 May 2020 22:12:02 -0700 Subject: [PATCH 25/43] readme update -- spelling corrections -- formatting --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 729437f09..cd90d49e0 100644 --- a/README.md +++ b/README.md @@ -19,20 +19,22 @@ SV2TTS is a three-stage deep learning framework that allows the creation of a nu ## Get Started ### Requirements -Please use the setup.sh or setup.bat if your on linux and windows respictevly to install the dependancies, and requirements. Currently only python 3.7.x is supported. +Please use the setup.sh or setup.bat if you're on linux and windows respictevly to install the dependancies, and requirements. Currently only python 3.7.x is supported. #### Install Manually: You will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1) installed first, then run `pip install -r requirements.txt` to install the necessary packages. ### After install Steps -Next you will need pretrained models if you don't plan to train your own. Those can be downloaded at [models](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). 
These models were trained with a cuda device, so they'll produce finiky result for a cpu. New models will need to be produced first. (As of 5/1/20) Download the models, and uncompress them in this root folder. They should result in /encoder/saved_models, /synthesizer/saved_models, and /vocoder/saved_models. +Next you will need [pretrained models](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models) if you don't plan to train your own. +These models were trained on a cuda device, so they'll produce finicky results for a cpu. New CPU models will need to be produced first. (As of 5/1/20) +Download the models, and uncompress them in this root folder. If done correctly, it should result as /encoder/saved_models, /synthesizer/saved_models, and /vocoder/saved_models. ### Test installation When you believe you have all the neccesary soup, test the program by running `python demo_cli.py`. If all tests pass, you're good to go. To use the cpu, use the option `--cpu`. ### Generate Audio from dataset -There are a few preconfigured options for datasets. One in perticular, [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz) is made to work from demo_toolbox.py. When you download this dataset, you can locate the directory anywhere, but creating a folder in this directory named datasets is recommended. +There are a few preconfigured options for datasets. One in perticular, [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz) is made to work from demo_toolbox.py. When you download this dataset, you can locate the directory anywhere, but creating a folder in this directory named `datasets` is recommended. To run the toolbox, use `python demo_toolbox.py -d datasets` if you followed the recommendation for directory location. Otherwise, include the full path to the dataset. From 53a821d3b2de212d7abb3311999b2e87318b4d84 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Fri, 1 May 2020 22:12:58 -0700 Subject: [PATCH 26/43] readme update -- formatting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cd90d49e0..9e7d97dad 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ You will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1) inst ### After install Steps Next you will need [pretrained models](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models) if you don't plan to train your own. These models were trained on a cuda device, so they'll produce finicky results for a cpu. New CPU models will need to be produced first. (As of 5/1/20) -Download the models, and uncompress them in this root folder. If done correctly, it should result as /encoder/saved_models, /synthesizer/saved_models, and /vocoder/saved_models. +Download the models, and uncompress them in this root folder. If done correctly, it should result as `/encoder/saved_models`, `/synthesizer/saved_models`, and `/vocoder/saved_models`. ### Test installation When you believe you have all the neccesary soup, test the program by running `python demo_cli.py`. 
From a8f0781498cbe4989a708ec8acf34d4b3b2fb50d Mon Sep 17 00:00:00 2001 From: pusalieth Date: Sat, 2 May 2020 00:41:02 -0700 Subject: [PATCH 27/43] initializing for rocm support (amd rnn) --- requirements.txt | 4 ++++ setup.sh | 21 ++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 823131d6e..cb6910ea7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,10 @@ sounddevice PyQt5 umap-learn +## AMD CPU support in tensorflow 2.0 +# tensorflow-rocm +# rocm-dkms + ## tested demo_cli.py and demo_toolbox.py ## Unused requirements #visdom diff --git a/setup.sh b/setup.sh index 1b319b77a..ebeee18f1 100644 --- a/setup.sh +++ b/setup.sh @@ -9,9 +9,28 @@ fi conda install pytorch python3.7 -m pip install -r requirements.txt sudo apt -y install libportaudio2 + +## Future AMD setup (needs tensorflow api v2) +amd='FALSE' +if [ $amd == 'TRUE' ]; then + sudo apt update + sudo apt -y dist-upgrade + sudo apt install libnuma-dev + sudo reboot + + wget -q -O - https://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | sudo apt-key add - + echo 'deb [arch=amd64] https://repo.radeon.com/rocm/apt/debian/ xenial main' | sudo tee /etc/apt/sources.list.d/rocm.list + sudo apt update + sudo apt install rocm-dkms + sudo usermod -a -G video $LOGNAME + echo 'ADD_EXTRA_GROUPS=1' | sudo tee -a /etc/adduser.conf + echo 'EXTRA_GROUPS=video' | sudo tee -a /etc/adduser.conf + sudo reboot +fi + echo "Finished installation" -# Possible fix for "This application failed to start because no Qt platform plugin could be initialized. Reinstalling the application may fix this problem." +## Possible fix for "This application failed to start because no Qt platform plugin could be initialized. Reinstalling the application may fix this problem." 
# sudo apt-get install libxkbcommon-x11-dev ## Possible fix for webrtcvad failure From 1cc8f2b882c86a49d981e7b7afbc96a89a111219 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Mon, 4 May 2020 18:51:48 -0700 Subject: [PATCH 28/43] -- working cpu train for synthesizer --- .gitignore | 1 + demo_cli.py | 6 ++-- demo_toolbox.py | 5 ++-- encoder/train.py | 7 +++-- encoder_preprocess.py | 4 +-- encoder_train.py | 2 +- requirements.txt | 9 ++++-- synthesizer/feeder.py | 18 ++++++------ synthesizer/models/helpers.py | 2 +- synthesizer/models/modules.py | 6 ++-- synthesizer/models/tacotron.py | 10 +++---- synthesizer/preprocess.py | 8 +++-- synthesizer/train.py | 50 ++++++++++++++++---------------- synthesizer_preprocess_audio.py | 2 +- synthesizer_preprocess_embeds.py | 2 +- synthesizer_train.py | 2 +- 16 files changed, 70 insertions(+), 64 deletions(-) diff --git a/.gitignore b/.gitignore index 81bc270e4..e3f1b81bc 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ *.bcf *.toc *.wav +datasets/* encoder/saved_models/* synthesizer/saved_models/* vocoder/saved_models/* diff --git a/demo_cli.py b/demo_cli.py index cac78b7ad..26a743eea 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -178,11 +178,11 @@ sd.play(generated_wav, synthesizer.sample_rate) # Save it on the disk - fpath = "demo_output_%02d.wav" % num_generated + filename = "demo_output_%02d.wav" % num_generated print(generated_wav.dtype) - sf.write(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate) + sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate) num_generated += 1 - print("\nSaved output as %s\n\n" % fpath) + print("\nSaved output as %s\n\n" % filename) except Exception as e: diff --git a/demo_toolbox.py b/demo_toolbox.py index 485c1366d..57e0a1f48 100644 --- a/demo_toolbox.py +++ b/demo_toolbox.py @@ -10,12 +10,11 @@ formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - parser.add_argument("-d", "--datasets_root", type=Path, help= \ + parser.add_argument("-d", "--datasets_root", type=Path, default="./datasets/", help= \ "Path to the directory containing your datasets. See toolbox/__init__.py for a list of " "supported datasets. You can add your own data by created a directory named UserAudio " "in your datasets root. Supported formats are mp3, flac, wav and m4a. Each speaker should " - "be inside a directory, e.g. /UserAudio/speaker_01/audio_01.wav.", - default=None) + "be inside a directory, e.g. /UserAudio/speaker_01/audio_01.wav.") parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models", help="Directory containing saved encoder models") parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models", diff --git a/encoder/train.py b/encoder/train.py index 071af1b9c..87f2642c4 100644 --- a/encoder/train.py +++ b/encoder/train.py @@ -7,11 +7,12 @@ import torch def sync(device: torch.device): - # FIXME - return # For correct profiling (cuda operations are async) if device.type == "cuda": torch.cuda.synchronize(device) + else: + torch.cpu.synchronize(device) + def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, @@ -30,7 +31,7 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, # hyperparameters) faster on the CPU. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # FIXME: currently, the gradient is None if loss_device is cuda - loss_device = torch.device("cpu") + loss_device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Create the model and the optimizer model = SpeakerEncoder(device, loss_device) diff --git a/encoder_preprocess.py b/encoder_preprocess.py index f69f3200a..9fe0a6655 100644 --- a/encoder_preprocess.py +++ b/encoder_preprocess.py @@ -24,12 +24,12 @@ class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptio " -dev", formatter_class=MyFormatter ) - parser.add_argument("datasets_root", type=Path, help=\ + parser.add_argument('-d', "--datasets_root", type=Path, default='./datasets/', help=\ "Path to the directory containing your LibriSpeech/TTS and VoxCeleb datasets.") parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\ "Path to the output directory that will contain the mel spectrograms. If left out, " "defaults to /SV2TTS/encoder/") - parser.add_argument("-d", "--datasets", type=str, + parser.add_argument("-dt", "--datasets_type", type=str, default="librispeech_other,voxceleb1,voxceleb2", help=\ "Comma-separated list of the name of the datasets you want to preprocess. Only the train " "set of these datasets will be used. Possible names: librispeech_other, voxceleb1, " diff --git a/encoder_train.py b/encoder_train.py index b8740a894..c2b65cc6b 100644 --- a/encoder_train.py +++ b/encoder_train.py @@ -14,7 +14,7 @@ "Name for this model instance. If a model state from the same run ID was previously " "saved, the training will restart from there. Pass -f to overwrite saved states and " "restart from scratch.") - parser.add_argument("clean_data_root", type=Path, help= \ + parser.add_argument("-d", "--clean_data_root", type=Path, default='./datasets/SV2TTS/encoder/', help= \ "Path to the output directory of encoder_preprocess.py. If you left the default " "output directory when preprocessing, it should be /SV2TTS/encoder/.") parser.add_argument("-m", "--models_dir", type=Path, default="encoder/saved_models/", help=\ diff --git a/requirements.txt b/requirements.txt index cb6910ea7..a1b251803 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,12 @@ # python3.7.x (6,7) confirmed +# each portion of tensorflow is neeed +# core package is for RNN, cpu and gpu are for specific system speed-ups +tensorflow==1.15 +tensorflow-cpu==1.15 +tensorflow-gpu==1.15 +# dependancies unidecode inflect -tensorflow==1.15 numpy>=1.14.0 matplotlib>=2.0.2 librosa>=0.5.1 @@ -11,6 +16,7 @@ webrtcvad sounddevice PyQt5 umap-learn +visdom ## AMD CPU support in tensorflow 2.0 # tensorflow-rocm @@ -18,7 +24,6 @@ umap-learn ## tested demo_cli.py and demo_toolbox.py ## Unused requirements -#visdom #scipy>=1.0.0 #tqdm #numba==0.48.0 diff --git a/synthesizer/feeder.py b/synthesizer/feeder.py index 6fc1b2022..b1acb3d54 100644 --- a/synthesizer/feeder.py +++ b/synthesizer/feeder.py @@ -70,22 +70,22 @@ def __init__(self, coordinator, metadata_filename, hparams): # Create placeholders for inputs and targets. Don"t specify batch size because we want # to be able to feed different batch sizes at eval time. 
self._placeholders = [ - tf.placeholder(tf.int32, shape=(None, None), name="inputs"), - tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"), - tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels), + tf.compat.v1.placeholder(tf.int32, shape=(None, None), name="inputs"), + tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="input_lengths"), + tf.compat.v1.placeholder(tf.float32, shape=(None, None, hparams.num_mels), name="mel_targets"), - tf.placeholder(tf.float32, shape=(None, None), name="token_targets"), - tf.placeholder(tf.int32, shape=(None, ), name="targets_lengths"), - tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), + tf.compat.v1.placeholder(tf.float32, shape=(None, None), name="token_targets"), + tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="targets_lengths"), + tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None), name="split_infos"), # SV2TTS - tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size), + tf.compat.v1.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size), name="speaker_embeddings") ] # Create queue for buffering data - queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, + queue = tf.queue.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32, tf.int32, tf.int32, tf.float32], name="input_queue") self._enqueue_op = queue.enqueue(self._placeholders) self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \ @@ -100,7 +100,7 @@ def __init__(self, coordinator, metadata_filename, hparams): self.speaker_embeddings.set_shape(self._placeholders[6].shape) # Create eval queue for buffering eval data - eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, + eval_queue = tf.queue.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32, tf.int32, tf.int32, tf.float32], name="eval_queue") self._eval_enqueue_op = eval_queue.enqueue(self._placeholders) self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \ diff --git a/synthesizer/models/helpers.py b/synthesizer/models/helpers.py index eec069940..4e58ccd00 100644 --- a/synthesizer/models/helpers.py +++ b/synthesizer/models/helpers.py @@ -119,7 +119,7 @@ def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, n #Pick previous outputs randomly with respect to teacher forcing ratio next_inputs = tf.cond( - tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), + tf.less(tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio), lambda: self._targets[:, time, :], #Teacher-forcing: return true frame lambda: outputs[:,-self._output_dim:]) diff --git a/synthesizer/models/modules.py b/synthesizer/models/modules.py index 7efc1a651..6ea9f1a10 100644 --- a/synthesizer/models/modules.py +++ b/synthesizer/models/modules.py @@ -148,10 +148,8 @@ def __call__(self, inputs, state, scope=None): if self.is_training: # nn.dropout takes keep_prob (probability to keep activations) not drop_prob ( # probability to mask activations)! 
- c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c, - (1 - self._zoneout_cell)) + prev_c - h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h, - (1 - self._zoneout_outputs)) + prev_h + c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c, (1 - self._zoneout_cell)) + prev_c + h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h, (1 - self._zoneout_outputs)) + prev_h else: c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c diff --git a/synthesizer/models/tacotron.py b/synthesizer/models/tacotron.py index 4738ffcc4..c3bcdd0f8 100644 --- a/synthesizer/models/tacotron.py +++ b/synthesizer/models/tacotron.py @@ -381,10 +381,10 @@ def add_loss(self): linear_loss = 0. else: # Compute loss of predictions before postnet - before = tf.losses.mean_squared_error(self.tower_mel_targets[i], + before = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i], self.tower_decoder_output[i]) # Compute loss after postnet - after = tf.losses.mean_squared_error(self.tower_mel_targets[i], + after = tf.compat.v1.losses.mean_squared_error(self.tower_mel_targets[i], self.tower_mel_outputs[i]) # Compute loss (for learning dynamic generation stop) stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( @@ -475,7 +475,7 @@ def add_optimizer(self, global_step): self.learning_rate = tf.convert_to_tensor( hp.tacotron_initial_learning_rate) - optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1, + optimizer = tf.compat.v1.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1, hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon) # 2. Compute Gradient @@ -518,7 +518,7 @@ def add_optimizer(self, global_step): # Add dependency on UPDATE_OPS; otherwise batchnorm won"t work correctly. 
See: # https://github.com/tensorflow/tensorflow/issues/1122 - with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): + with tf.control_dependencies(tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)): self.optimize = optimizer.apply_gradients(zip(clipped_gradients, vars), global_step=global_step) @@ -538,7 +538,7 @@ def _learning_rate_decay(self, init_lr, global_step): hp = self._hparams # Compute natural exponential decay - lr = tf.train.exponential_decay(init_lr, + lr = tf.compat.v1.train.exponential_decay(init_lr, global_step - hp.tacotron_start_decay, # lr = 1e-3 at step 50k self.decay_steps, diff --git a/synthesizer/preprocess.py b/synthesizer/preprocess.py index b2894aa71..5378832ee 100644 --- a/synthesizer/preprocess.py +++ b/synthesizer/preprocess.py @@ -8,14 +8,16 @@ from tqdm import tqdm import numpy as np import librosa +import os def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams): # Gather the input directories + if(str(datasets_root)[0] != '/' or str(datasets_root)[1] != ':'): + datasets_root = Path(os.getcwd() + '/' + str(datasets_root)) dataset_root = datasets_root.joinpath("LibriSpeech") - input_dirs = [dataset_root.joinpath("train-clean-100"), - dataset_root.joinpath("train-clean-360")] + input_dirs = [dataset_root.joinpath("train-clean-100"), dataset_root.joinpath("train-clean-360")] print("\n ".join(map(str, ["Using data from:"] + input_dirs))) assert all(input_dir.exists() for input_dir in input_dirs) @@ -82,7 +84,7 @@ def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams) def split_on_silences(wav_fpath, words, end_times, hparams): # Load the audio waveform - wav, _ = librosa.load(wav_fpath, hparams.sample_rate) + wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate) if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max diff --git a/synthesizer/train.py b/synthesizer/train.py index 4fe6bbda3..965ed63e7 100644 --- a/synthesizer/train.py +++ b/synthesizer/train.py @@ -33,31 +33,31 @@ def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoi def add_train_stats(model, hparams): - with tf.variable_scope("stats") as scope: + with tf.compat.v1.variable_scope("stats") as scope: for i in range(hparams.tacotron_num_gpus): - tf.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i]) - tf.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i]) - tf.summary.scalar("before_loss", model.before_loss) - tf.summary.scalar("after_loss", model.after_loss) + tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i]) + tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i]) + tf.compat.v1.summary.scalar("before_loss", model.before_loss) + tf.compat.v1.summary.scalar("after_loss", model.after_loss) if hparams.predict_linear: - tf.summary.scalar("linear_loss", model.linear_loss) + tf.compat.v1.summary.scalar("linear_loss", model.linear_loss) for i in range(hparams.tacotron_num_gpus): - tf.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i]) - tf.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i]) + tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i]) + tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i]) - tf.summary.scalar("regularization_loss", model.regularization_loss) - tf.summary.scalar("stop_token_loss", model.stop_token_loss) - tf.summary.scalar("loss", 
model.loss) - tf.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed + tf.compat.v1.summary.scalar("regularization_loss", model.regularization_loss) + tf.compat.v1.summary.scalar("stop_token_loss", model.stop_token_loss) + tf.compat.v1.summary.scalar("loss", model.loss) + tf.compat.v1.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed if hparams.tacotron_teacher_forcing_mode == "scheduled": - tf.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing + tf.compat.v1.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing # ratio decay when mode = "scheduled" gradient_norms = [tf.norm(grad) for grad in model.gradients] - tf.summary.histogram("gradient_norm", gradient_norms) - tf.summary.scalar("max_gradient_norm", tf.reduce_max(gradient_norms)) # visualize + tf.compat.v1.summary.histogram("gradient_norm", gradient_norms) + tf.compat.v1.summary.scalar("max_gradient_norm", tf.reduce_max(gradient_norms)) # visualize # gradients (in case of explosion) - return tf.summary.merge_all() + return tf.compat.v1.summary.merge_all() def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, @@ -83,7 +83,7 @@ def time_string(): def model_train_mode(args, feeder, hparams, global_step): - with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope: + with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope: model = create_model("Tacotron", hparams) model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings, feeder.mel_targets, feeder.token_targets, @@ -96,7 +96,7 @@ def model_train_mode(args, feeder, hparams, global_step): def model_test_mode(args, feeder, hparams, global_step): - with tf.variable_scope("Tacotron_model", reuse=tf.AUTO_REUSE) as scope: + with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope: model = create_model("Tacotron", hparams) model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_speaker_embeddings, feeder.eval_mel_targets, @@ -136,11 +136,11 @@ def train(log_dir, args, hparams): log(hparams_debug_string()) # Start by setting a seed for repeatability - tf.set_random_seed(hparams.tacotron_random_seed) + tf.compat.v1.set_random_seed(hparams.tacotron_random_seed) # Set up data feeder coord = tf.train.Coordinator() - with tf.variable_scope("datafeeder") as scope: + with tf.compat.v1.variable_scope("datafeeder") as scope: feeder = Feeder(coord, metadat_fpath, hparams) # Set up model: @@ -164,21 +164,21 @@ def train(log_dir, args, hparams): step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) - saver = tf.train.Saver(max_to_keep=5) + saver = tf.compat.v1.train.Saver(max_to_keep=5) log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps)) # Memory allocation on the GPU as needed - config = tf.ConfigProto() + config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True # Train - with tf.Session(config=config) as sess: + with tf.compat.v1.Session(config=config) as sess: try: - summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph) + summary_writer = tf.compat.v1.summary.FileWriter(tensorboard_dir, sess.graph) - sess.run(tf.global_variables_initializer()) + sess.run(tf.compat.v1.global_variables_initializer()) # saved model restoring if args.restore: diff --git a/synthesizer_preprocess_audio.py 
b/synthesizer_preprocess_audio.py index a0dc47b4d..2f52cd86d 100644 --- a/synthesizer_preprocess_audio.py +++ b/synthesizer_preprocess_audio.py @@ -12,7 +12,7 @@ "vocoder for training.", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - parser.add_argument("datasets_root", type=Path, help=\ + parser.add_argument('-d', "--datasets_root", type=Path, default='./datasets/', help=\ "Path to the directory containing your LibriSpeech/TTS datasets.") parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\ "Path to the output directory that will contain the mel spectrograms, the audios and the " diff --git a/synthesizer_preprocess_embeds.py b/synthesizer_preprocess_embeds.py index 94f864d5d..afba770d1 100644 --- a/synthesizer_preprocess_embeds.py +++ b/synthesizer_preprocess_embeds.py @@ -9,7 +9,7 @@ description="Creates embeddings for the synthesizer from the LibriSpeech utterances.", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - parser.add_argument("synthesizer_root", type=Path, help=\ + parser.add_argument('-d', "--synthesizer_root", type=Path, default='./datasets/SV2TTS/synthesizer/', help=\ "Path to the synthesizer training data that contains the audios and the train.txt file. " "If you let everything as default, it should be /SV2TTS/synthesizer/.") parser.add_argument("-e", "--encoder_model_fpath", type=Path, diff --git a/synthesizer_train.py b/synthesizer_train.py index 4d46bcb80..5c46d1af6 100644 --- a/synthesizer_train.py +++ b/synthesizer_train.py @@ -19,7 +19,7 @@ def prepare_run(args): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("name", help="Name of the run and of the logging directory.") - parser.add_argument("synthesizer_root", type=str, help=\ + parser.add_argument('-d', "--synthesizer_root", type=str, default='./datasets/SV2TTS/synthesizer/', help=\ "Path to the synthesizer training data that contains the audios and the train.txt file. 
" "If you let everything as default, it should be /SV2TTS/synthesizer/.") parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\ From 9bd717f516e4730219a3f4bd0efd44bc16a2cd3a Mon Sep 17 00:00:00 2001 From: pusalieth Date: Mon, 4 May 2020 20:34:58 -0700 Subject: [PATCH 29/43] fix requirements -- tested from scratch on google compute --- requirements.txt | 1 + setup.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a1b251803..fd528710d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ tensorflow==1.15 tensorflow-cpu==1.15 tensorflow-gpu==1.15 + # dependancies unidecode inflect diff --git a/setup.sh b/setup.sh index ebeee18f1..4a51dc1b7 100644 --- a/setup.sh +++ b/setup.sh @@ -8,7 +8,7 @@ fi conda install pytorch python3.7 -m pip install -r requirements.txt -sudo apt -y install libportaudio2 +sudo apt -y install libportaudio2 gcc ## Future AMD setup (needs tensorflow api v2) amd='FALSE' From 861210ea57c8f6e15d0bf80c7c733f2f232308a4 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Mon, 4 May 2020 20:36:21 -0700 Subject: [PATCH 30/43] -- update of linux setup process order --- setup.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.sh b/setup.sh index 4a51dc1b7..e47feba4f 100644 --- a/setup.sh +++ b/setup.sh @@ -2,13 +2,13 @@ conda_installed=$(conda list | grep 'conda: command not found') if [ '$conda_installed' != '' ]; then wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh chmod +x Miniconda3-latest-Linux-x86_64.sh - Miniconda3-latest-Linux-x86_64.sh - mv Miniconda3-latest-Linux-x86_64.sh ~/Downloads + ./Miniconda3-latest-Linux-x86_64.sh + rm Miniconda3-latest-Linux-x86_64.sh ~/Downloads fi conda install pytorch -python3.7 -m pip install -r requirements.txt sudo apt -y install libportaudio2 gcc +python3.7 -m pip install -r requirements.txt ## Future AMD setup (needs tensorflow api v2) amd='FALSE' From 77abfc7631138adbeb4fedb8cc8eb448a45c411e Mon Sep 17 00:00:00 2001 From: pusalieth Date: Mon, 4 May 2020 22:19:14 -0700 Subject: [PATCH 31/43] save synth chkpt every 50 --- synthesizer/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synthesizer/train.py b/synthesizer/train.py index 965ed63e7..854a9be89 100644 --- a/synthesizer/train.py +++ b/synthesizer/train.py @@ -325,7 +325,7 @@ def train(log_dir, args, hparams): stop_token_loss, eval_loss) if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \ - step == 300: + step % 50 == 0: # Save model and current global step saver.save(sess, checkpoint_fpath, global_step=global_step) From 97636a5f603313af6c15455889ccd63eeecf2027 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 5 May 2020 00:32:34 -0700 Subject: [PATCH 32/43] linux dependency update --- setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.sh b/setup.sh index e47feba4f..581213575 100644 --- a/setup.sh +++ b/setup.sh @@ -7,7 +7,7 @@ if [ '$conda_installed' != '' ]; then fi conda install pytorch -sudo apt -y install libportaudio2 gcc +sudo apt -y install libportaudio2 gcc libsndfile1 python3.7 -m pip install -r requirements.txt ## Future AMD setup (needs tensorflow api v2) From fc59b39b5f68baff916567dbe565c18021c894e5 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 5 May 2020 00:32:50 -0700 Subject: [PATCH 33/43] typo --- setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.sh b/setup.sh index 
581213575..a2fd712cf 100644 --- a/setup.sh +++ b/setup.sh @@ -3,7 +3,7 @@ if [ '$conda_installed' != '' ]; then wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh chmod +x Miniconda3-latest-Linux-x86_64.sh ./Miniconda3-latest-Linux-x86_64.sh - rm Miniconda3-latest-Linux-x86_64.sh ~/Downloads + rm Miniconda3-latest-Linux-x86_64.sh fi conda install pytorch From 48d47dae6b84ac8f85c0a230a082a7e5e3bee9cf Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 5 May 2020 02:33:46 -0700 Subject: [PATCH 34/43] readme type --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9e7d97dad..7250a46b4 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ SV2TTS is a three-stage deep learning framework that allows the creation of a nu ## Get Started ### Requirements -Please use the setup.sh or setup.bat if you're on linux and windows respictevly to install the dependancies, and requirements. Currently only python 3.7.x is supported. +Please use the setup.sh or setup.bat if you're on linux and windows respectively to install the dependancies, and requirements. Currently only python 3.7.x is supported. #### Install Manually: You will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1) installed first, then run `pip install -r requirements.txt` to install the necessary packages. From 578561755ca939daf52186d46092e4daf081c7b8 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 5 May 2020 02:37:11 -0700 Subject: [PATCH 35/43] readme update --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7250a46b4..daf125f28 100644 --- a/README.md +++ b/README.md @@ -34,9 +34,9 @@ When you believe you have all the neccesary soup, test the program by running `p If all tests pass, you're good to go. To use the cpu, use the option `--cpu`. ### Generate Audio from dataset -There are a few preconfigured options for datasets. One in perticular, [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz) is made to work from demo_toolbox.py. When you download this dataset, you can locate the directory anywhere, but creating a folder in this directory named `datasets` is recommended. +There are a few preconfigured options for datasets. One in perticular, [`LibriSpeech/train-clean-100`](http://www.openslr.org/resources/12/train-clean-100.tar.gz) is made to work from demo_toolbox.py. When you download this dataset, you can locate the directory anywhere, but creating a folder in this directory named `datasets` is recommended. (All scripts will use this directory as default) -To run the toolbox, use `python demo_toolbox.py -d datasets` if you followed the recommendation for directory location. Otherwise, include the full path to the dataset. +To run the toolbox, use `python demo_toolbox.py` if you followed the recommendation for the datasets directory location. Otherwise, include the full path to the dataset and use the option `-d`. To set the speaker, you'll need an input audio file. use browse in the toolbox to your personal audio file, or record to set your own voice. 
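For reference, the toolbox and `demo_cli.py` both drive the same encoder, synthesizer, and vocoder stages that the README above describes. Below is a minimal programmatic sketch of that flow, assuming the pretrained checkpoints were unpacked into the default `saved_models` locations and that you supply your own reference recording (the input file name here is only a placeholder):

```python
# Rough sketch of the encoder -> synthesizer -> vocoder flow wrapped by
# demo_cli.py and the toolbox. The checkpoint paths below are assumptions
# matching the default pretrained-model layout; adjust them to your setup.
from pathlib import Path
import numpy as np
import soundfile as sf

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))

# 1. Embed a few seconds of reference speech (placeholder path).
ref_wav = Synthesizer.load_preprocess_wav("datasets/my_reference.wav")
embed = encoder.embed_utterance(encoder.preprocess_wav(ref_wav))

# 2. Synthesize a mel spectrogram conditioned on that speaker embedding.
specs = synthesizer.synthesize_spectrograms(["Hello, this is a cloned voice."], [embed])

# 3. Vocode the spectrogram back to a waveform, normalize, and save it.
generated = vocoder.infer_waveform(specs[0])
generated = generated / np.abs(generated).max() * 0.97
sf.write("cloned_output.wav", generated.astype(np.float32), Synthesizer.sample_rate)
```

This mirrors what `demo_cli.py` does interactively, so it can serve as a starting point for batch generation once the installation test passes.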
From b606154fce79886f17f8ec2d32f1cdbf31e7c999 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 5 May 2020 13:06:58 -0700 Subject: [PATCH 36/43] --possible amd framework on windows using plaidml --- requirements.txt | 4 ++++ setup.bat | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fd528710d..eb52196a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,10 @@ umap-learn visdom ## AMD CPU support in tensorflow 2.0 +#### win #### +# keras +# plaidml-keras plaidbench +#### linux #### # tensorflow-rocm # rocm-dkms diff --git a/setup.bat b/setup.bat index 78b6e877b..bb71e49de 100644 --- a/setup.bat +++ b/setup.bat @@ -7,4 +7,6 @@ curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -o conda -y install pytorch start cmd /k %userprofile%/miniconda3/Scripts/activate base cd /D "%~dp0" -pip install -r requirements.txt \ No newline at end of file +pip install -r requirements.txt + +plaidml-setup \ No newline at end of file From c8fdb740660f4208056304426d332d3173d8c81a Mon Sep 17 00:00:00 2001 From: pusalieth Date: Tue, 5 May 2020 13:38:25 -0700 Subject: [PATCH 37/43] partial update to tf 2.0 --- .vscode/settings.json | 4 ++++ demo_cli.py | 1 - demo_toolbox.py | 1 - .../data_objects/speaker_verification_dataset.py | 1 - encoder/train.py | 1 - encoder/visualizations.py | 1 - encoder_train.py | 1 - synthesizer/inference.py | 1 - synthesizer/preprocess.py | 1 - synthesizer/train.py | 16 ++++++++-------- vocoder_preprocess.py | 1 - vocoder_train.py | 1 - 12 files changed, 12 insertions(+), 18 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..9edfb50f4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "python.pythonPath": "C:\\Users\\jakep\\AppData\\Local\\Programs\\Python\\Python37\\python.exe", + "terminal.integrated.shell.windows": "“terminal.integrated.shellArgs.windows”: [“/K”, “C:/ProgramData/Miniconda3/Scripts/activate.bat C:/ProgramData/Miniconda3/”]" +} \ No newline at end of file diff --git a/demo_cli.py b/demo_cli.py index 26a743eea..0bc9c0e26 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -188,4 +188,3 @@ except Exception as e: print("Caught exception: %s" % repr(e)) print("Restarting\n") - \ No newline at end of file diff --git a/demo_toolbox.py b/demo_toolbox.py index 57e0a1f48..8ec9772d4 100644 --- a/demo_toolbox.py +++ b/demo_toolbox.py @@ -29,4 +29,3 @@ # Launch the toolbox print_args(args, parser) Toolbox(**vars(args)) - \ No newline at end of file diff --git a/encoder/data_objects/speaker_verification_dataset.py b/encoder/data_objects/speaker_verification_dataset.py index 77a6e05ea..c1ddd501c 100644 --- a/encoder/data_objects/speaker_verification_dataset.py +++ b/encoder/data_objects/speaker_verification_dataset.py @@ -53,4 +53,3 @@ def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler= def collate(self, speakers): return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames) - \ No newline at end of file diff --git a/encoder/train.py b/encoder/train.py index 87f2642c4..616178b0d 100644 --- a/encoder/train.py +++ b/encoder/train.py @@ -123,4 +123,3 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, }, backup_fpath) profiler.tick("Extras (visualizations, saving)") - \ No newline at end of file diff --git a/encoder/visualizations.py b/encoder/visualizations.py index 980c74f95..266fa3695 100644 --- 
a/encoder/visualizations.py +++ b/encoder/visualizations.py @@ -175,4 +175,3 @@ def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, def save(self): if not self.disabled: self.vis.save([self.env_name]) - \ No newline at end of file diff --git a/encoder_train.py b/encoder_train.py index c2b65cc6b..b03de9649 100644 --- a/encoder_train.py +++ b/encoder_train.py @@ -44,4 +44,3 @@ # Run the training print_args(args, parser) train(**vars(args)) - \ No newline at end of file diff --git a/synthesizer/inference.py b/synthesizer/inference.py index 37562c453..86bd2fb84 100644 --- a/synthesizer/inference.py +++ b/synthesizer/inference.py @@ -134,4 +134,3 @@ def griffin_lim(mel): with the same parameters present in hparams.py. """ return audio.inv_mel_spectrogram(mel, hparams) - \ No newline at end of file diff --git a/synthesizer/preprocess.py b/synthesizer/preprocess.py index 5378832ee..0bfc3d6b9 100644 --- a/synthesizer/preprocess.py +++ b/synthesizer/preprocess.py @@ -224,4 +224,3 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) job = Pool(n_processes).imap(func, fpaths) list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) - \ No newline at end of file diff --git a/synthesizer/train.py b/synthesizer/train.py index 854a9be89..846adba68 100644 --- a/synthesizer/train.py +++ b/synthesizer/train.py @@ -53,9 +53,9 @@ def add_train_stats(model, hparams): if hparams.tacotron_teacher_forcing_mode == "scheduled": tf.compat.v1.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing # ratio decay when mode = "scheduled" - gradient_norms = [tf.norm(grad) for grad in model.gradients] + gradient_norms = [tf.norm(tensor=grad) for grad in model.gradients] tf.compat.v1.summary.histogram("gradient_norm", gradient_norms) - tf.compat.v1.summary.scalar("max_gradient_norm", tf.reduce_max(gradient_norms)) # visualize + tf.compat.v1.summary.scalar("max_gradient_norm", tf.reduce_max(input_tensor=gradient_norms)) # visualize # gradients (in case of explosion) return tf.compat.v1.summary.merge_all() @@ -63,18 +63,18 @@ def add_train_stats(model, hparams): def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, loss): values = [ - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss", + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss", simple_value=before_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss", + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss", simple_value=after_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss", + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss", simple_value=stop_token_loss), - tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss), + tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss), ] if linear_loss is not None: - values.append(tf.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss", + values.append(tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss", simple_value=linear_loss)) - test_summary = tf.Summary(value=values) + test_summary = tf.compat.v1.Summary(value=values) summary_writer.add_summary(test_summary, step) diff --git a/vocoder_preprocess.py b/vocoder_preprocess.py index 415ed37ac..be12990d5 100644 
--- a/vocoder_preprocess.py +++ b/vocoder_preprocess.py @@ -38,4 +38,3 @@ class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptio args.out_dir = os.path.join(args.datasets_root, "SV2TTS", "vocoder") run_synthesis(args.in_dir, args.out_dir, args.model_dir, modified_hp) - \ No newline at end of file diff --git a/vocoder_train.py b/vocoder_train.py index d712ffa3e..55717b597 100644 --- a/vocoder_train.py +++ b/vocoder_train.py @@ -53,4 +53,3 @@ # Run the training print_args(args, parser) train(**vars(args)) - \ No newline at end of file From 364c32aa57878fe42a2e101c78d7280a36fb04cb Mon Sep 17 00:00:00 2001 From: pusalieth Date: Fri, 8 May 2020 16:27:36 -0700 Subject: [PATCH 38/43] minor update -- windows batch updated -- readme updated -- fixed windows terminal load for vs code --- .vscode/settings.json | 5 +++-- README.md | 5 +++++ setup.bat | 22 +++++++++++++++------- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 9edfb50f4..dd7086174 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,5 @@ { - "python.pythonPath": "C:\\Users\\jakep\\AppData\\Local\\Programs\\Python\\Python37\\python.exe", - "terminal.integrated.shell.windows": "“terminal.integrated.shellArgs.windows”: [“/K”, “C:/ProgramData/Miniconda3/Scripts/activate.bat C:/ProgramData/Miniconda3/”]" + "python.pythonPath": "%userprofile%/miniconda3/AppData\\Local\\Programs\\Python\\Python37\\python.exe", + "terminal.integrated.shell.windows": "C:\\Windows\\System32\\cmd.exe", + "terminal.integrated.shellArgs.windows": ["/k", "%userprofile%/miniconda3/Scripts/activate base"] } \ No newline at end of file diff --git a/README.md b/README.md index daf125f28..1ea60c1b9 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,11 @@ SV2TTS is a three-stage deep learning framework that allows the creation of a nu ### Requirements Please use the setup.sh or setup.bat if you're on linux and windows respectively to install the dependancies, and requirements. Currently only python 3.7.x is supported. +* Windows Install Requirements +* * During python installation, make sure python is added to path during installation. +* * During conda installation, make sure you install it 'just for me'. +* * During ms build tools installation, you only need to install the c++ package, which requires around 4.7GB. Upon installation of build tools, you'll need to restart the computer to complete the install process. Rerun the setup.bat to finish the setup process. + #### Install Manually: You will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1) installed first, then run `pip install -r requirements.txt` to install the necessary packages. 
diff --git a/setup.bat b/setup.bat index bb71e49de..be096daaa 100644 --- a/setup.bat +++ b/setup.bat @@ -1,12 +1,20 @@ -curl https://www.python.org/ftp/python/3.7.7/python-3.7.7-amd64.exe -o %userprofile%/Downloads/python-3.7.7-amd64.exe -%userprofile%/Downloads/python-3.7.7-amd64.exe +if not exist %userprofile%/Downloads/python-3.7.7-amd64.exe ( + curl https://www.python.org/ftp/python/3.7.7/python-3.7.7-amd64.exe -o %userprofile%/Downloads/python-3.7.7-amd64.exe + %userprofile%/Downloads/python-3.7.7-amd64.exe +) -curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -o %userprofile%/Downloads/Miniconda3-latest-Windows-x86_64.exe -%userprofile%/Downloads/Miniconda3-latest-Windows-x86_64.exe +if not exist %userprofile%/Downloads/Miniconda3-latest-Windows-x86_64.exe ( + curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -o %userprofile%/Downloads/Miniconda3-latest-Windows-x86_64.exe + %userprofile%/Downloads/Miniconda3-latest-Windows-x86_64.exe +) -conda -y install pytorch -start cmd /k %userprofile%/miniconda3/Scripts/activate base +if not exist %userprofile%/Downloads/vs_BuildTools.exe ( + curl https://download.visualstudio.microsoft.com/download/pr/5e397ebe-38b2-4e18-a187-ac313d07332a/00945fbb0a29f63183b70370043e249218249f83dbc82cd3b46c5646503f9e27/vs_BuildTools.exe -o %userprofile%/Downloads/vs_BuildTools.exe + %userprofile%/Downloads/vs_BuildTools.exe +) + +start cmd /k "%userprofile%/miniconda3/Scripts/activate base & conda install -y pytorch & exit" cd /D "%~dp0" pip install -r requirements.txt -plaidml-setup \ No newline at end of file +:: plaidml-setup \ No newline at end of file From 8ed19db6181dd62694a595c88ac5a5ff12e09f77 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Fri, 8 May 2020 16:30:58 -0700 Subject: [PATCH 39/43] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1ea60c1b9..6074d5f23 100644 --- a/README.md +++ b/README.md @@ -22,9 +22,9 @@ SV2TTS is a three-stage deep learning framework that allows the creation of a nu Please use the setup.sh or setup.bat if you're on linux and windows respectively to install the dependancies, and requirements. Currently only python 3.7.x is supported. * Windows Install Requirements -* * During python installation, make sure python is added to path during installation. -* * During conda installation, make sure you install it 'just for me'. -* * During ms build tools installation, you only need to install the c++ package, which requires around 4.7GB. Upon installation of build tools, you'll need to restart the computer to complete the install process. Rerun the setup.bat to finish the setup process. + * During python installation, make sure python is added to path during installation. + * During conda installation, make sure you install it 'just for me'. + * During ms build tools installation, you only need to install the c++ package, which requires around 4.7GB. Upon installation of build tools, you'll need to restart the computer to complete the install process. Rerun the setup.bat to finish the setup process. #### Install Manually: You will need [PyTorch](https://pytorch.org/get-started/locally/) (>=1.0.1) installed first, then run `pip install -r requirements.txt` to install the necessary packages. 
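The conditional downloads added to setup.bat above follow one pattern: fetch an installer only when it is not already in the user's Downloads folder. A minimal Python rendering of the same idea, for readers following the setup on another platform (the batch script itself uses `if not exist` plus curl; this sketch is illustrative only):

    import os
    import urllib.request

    # Installer URL taken from setup.bat; the target mirrors %userprofile%/Downloads.
    PYTHON_INSTALLER_URL = "https://www.python.org/ftp/python/3.7.7/python-3.7.7-amd64.exe"
    target = os.path.join(os.path.expanduser("~"), "Downloads",
                          os.path.basename(PYTHON_INSTALLER_URL))

    if os.path.exists(target):
        print("Installer already present at %s, skipping download" % target)
    else:
        print("Downloading %s ..." % PYTHON_INSTALLER_URL)
        urllib.request.urlretrieve(PYTHON_INSTALLER_URL, target)

The same check-before-download guard is what makes rerunning setup.bat after the required reboot cheap: already-fetched installers are skipped.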
From a338a537da8d829a6d8dd87eff2c958cda82d18a Mon Sep 17 00:00:00 2001 From: pusalieth Date: Fri, 8 May 2020 17:23:46 -0700 Subject: [PATCH 40/43] -- setup updated easy saved_models download --- setup.bat | 6 ++++++ setup.sh | 6 ++++++ utils/argutils.py | 1 - utils/profiler.py | 1 - 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/setup.bat b/setup.bat index be096daaa..5dc406239 100644 --- a/setup.bat +++ b/setup.bat @@ -13,6 +13,12 @@ if not exist %userprofile%/Downloads/vs_BuildTools.exe ( %userprofile%/Downloads/vs_BuildTools.exe ) +if not exist vocoder/saved_models ( + python -m pip install gdown + gdown https://drive.google.com/uc?id=1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc + python -c "import zipfile; zipfile.ZipFile('pretrained.zip').extractall()" +) + start cmd /k "%userprofile%/miniconda3/Scripts/activate base & conda install -y pytorch & exit" cd /D "%~dp0" pip install -r requirements.txt diff --git a/setup.sh b/setup.sh index a2fd712cf..b54f4a614 100644 --- a/setup.sh +++ b/setup.sh @@ -6,6 +6,12 @@ if [ '$conda_installed' != '' ]; then rm Miniconda3-latest-Linux-x86_64.sh fi +if [ ! -d "vocoder/saved_models" ]; then + python -m pip install gdown + gdown https://drive.google.com/uc?id=1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc + python -c "import zipfile; zipfile.ZipFile('pretrained.zip').extractall()" +fi + conda install pytorch sudo apt -y install libportaudio2 gcc libsndfile1 python3.7 -m pip install -r requirements.txt diff --git a/utils/argutils.py b/utils/argutils.py index db4168302..e292769e0 100644 --- a/utils/argutils.py +++ b/utils/argutils.py @@ -37,4 +37,3 @@ def print_args(args: argparse.Namespace, parser=None): param, value = items[i] print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value)) print("") - \ No newline at end of file diff --git a/utils/profiler.py b/utils/profiler.py index 17175b9e1..66ef4033d 100644 --- a/utils/profiler.py +++ b/utils/profiler.py @@ -42,4 +42,3 @@ def summarize(self): print(" %s mean: %4.0fms std: %4.0fms" % (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000)) print("", flush=True) - \ No newline at end of file From 30f396bc4bd48e84a3ee4e1bcd252f1ee75b850d Mon Sep 17 00:00:00 2001 From: pusalieth Date: Fri, 8 May 2020 17:27:03 -0700 Subject: [PATCH 41/43] -- forgot cleanup --- setup.bat | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.bat b/setup.bat index 5dc406239..248c3322e 100644 --- a/setup.bat +++ b/setup.bat @@ -17,6 +17,7 @@ if not exist vocoder/saved_models ( python -m pip install gdown gdown https://drive.google.com/uc?id=1n1sPXvT34yXFLT47QZA6FIRGrwMeSsZc python -c "import zipfile; zipfile.ZipFile('pretrained.zip').extractall()" + del pretrained.zip ) start cmd /k "%userprofile%/miniconda3/Scripts/activate base & conda install -y pytorch & exit" From e446ca8247161bad40b6a229c82af91fed8b3391 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Fri, 8 May 2020 18:23:17 -0700 Subject: [PATCH 42/43] -- update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index e3f1b81bc..944558088 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,5 @@ LibriSpeech/* *.txt *.TXT *.flac +*.mp3 +*.zip From 1c375832d67a595a9c1a290f09706e71a0ee35f0 Mon Sep 17 00:00:00 2001 From: pusalieth Date: Sun, 10 May 2020 18:45:25 -0700 Subject: [PATCH 43/43] -- formatting -- terminal startup -- automatic split device detection --- .vscode/settings.json | 6 +- synthesizer/models/modules.py | 6 +- synthesizer/models/tacotron.py | 8 +- synthesizer/train.py | 224 
++++++++++++++++++--------------- synthesizer_train.py | 34 ++--- 5 files changed, 149 insertions(+), 129 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index dd7086174..58738f6f2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,7 @@ { - "python.pythonPath": "%userprofile%/miniconda3/AppData\\Local\\Programs\\Python\\Python37\\python.exe", "terminal.integrated.shell.windows": "C:\\Windows\\System32\\cmd.exe", - "terminal.integrated.shellArgs.windows": ["/k", "%userprofile%/miniconda3/Scripts/activate base"] + "terminal.integrated.shellArgs.windows": [ + "/k", + "%userprofile%/miniconda3/Scripts/activate base" + ] } \ No newline at end of file diff --git a/synthesizer/models/modules.py b/synthesizer/models/modules.py index 6ea9f1a10..f9fe7ebe9 100644 --- a/synthesizer/models/modules.py +++ b/synthesizer/models/modules.py @@ -150,7 +150,6 @@ def __call__(self, inputs, state, scope=None): # probability to mask activations)! c = (1 - self._zoneout_cell) * tf.nn.dropout(new_c - prev_c, (1 - self._zoneout_cell)) + prev_c h = (1 - self._zoneout_outputs) * tf.nn.dropout(new_h - prev_h, (1 - self._zoneout_outputs)) + prev_h - else: c = (1 - self._zoneout_cell) * new_c + self._zoneout_cell * prev_c h = (1 - self._zoneout_outputs) * new_h + self._zoneout_outputs * prev_h @@ -177,7 +176,7 @@ def __init__(self, is_training, hparams, activation=tf.nn.relu, scope=None): """ super(EncoderConvolutions, self).__init__() self.is_training = is_training - + self.kernel_size = hparams.enc_conv_kernel_size self.channels = hparams.enc_conv_channels self.activation = activation @@ -241,7 +240,8 @@ def __call__(self, inputs, input_lengths): class Prenet: - """Two fully connected layers used as an information bottleneck for the attention. + """ + Two fully connected layers used as an information bottleneck for the attention. 
""" def __init__(self, is_training, layers_sizes=[256, 256], drop_rate=0.5, activation=tf.nn.relu, diff --git a/synthesizer/models/tacotron.py b/synthesizer/models/tacotron.py index c3bcdd0f8..4b952c100 100644 --- a/synthesizer/models/tacotron.py +++ b/synthesizer/models/tacotron.py @@ -65,9 +65,7 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, raise RuntimeError( "Model can not be in training and evaluation modes at the same time!") - split_device = "/cpu:0" if self._hparams.tacotron_num_gpus > 1 or \ - self._hparams.split_on_cpu else "/gpu:{}".format( - self._hparams.tacotron_gpu_start_idx) + split_device = "/cpu:0" if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else "/gpu:{}".format(self._hparams.tacotron_gpu_start_idx) with tf.device(split_device): hp = self._hparams lout_int = [tf.int32] * hp.tacotron_num_gpus @@ -80,12 +78,10 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, targets_lengths is not None else targets_lengths ### SV2TTS ### - tower_embed_targets = tf.split(embed_targets, num_or_size_splits=hp.tacotron_num_gpus, axis=0) ############## - p_inputs = tf.numpy_function( split_func, [inputs, split_infos[:, 0]], lout_int) p_mel_targets = tf.numpy_function(split_func, [mel_targets, split_infos[:, 1]], @@ -124,7 +120,7 @@ def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None, gpus = ["/gpu:{}".format(i) for i in range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)] for i in range(hp.tacotron_num_gpus): - with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", + with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device=split_device, worker_device=gpus[i])): with tf.compat.v1.variable_scope("inference") as scope: assert hp.tacotron_teacher_forcing_mode in ( diff --git a/synthesizer/train.py b/synthesizer/train.py index 846adba68..16d99f4a4 100644 --- a/synthesizer/train.py +++ b/synthesizer/train.py @@ -20,42 +20,51 @@ def add_embedding_stats(summary_writer, embedding_names, paths_to_meta, checkpoi # Create tensorboard projector config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig() config.model_checkpoint_path = checkpoint_path - + for embedding_name, path_to_meta in zip(embedding_names, paths_to_meta): # Initialize config embedding = config.embeddings.add() # Specifiy the embedding variable and the metadata embedding.tensor_name = embedding_name embedding.metadata_path = path_to_meta - + # Project the embeddings to space dimensions for visualization - tf.contrib.tensorboard.plugins.projector.visualize_embeddings(summary_writer, config) + tf.contrib.tensorboard.plugins.projector.visualize_embeddings( + summary_writer, config) def add_train_stats(model, hparams): with tf.compat.v1.variable_scope("stats") as scope: for i in range(hparams.tacotron_num_gpus): - tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_mel_outputs[i]) - tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_mel_targets[i]) + tf.compat.v1.summary.histogram( + "mel_outputs %d" % i, model.tower_mel_outputs[i]) + tf.compat.v1.summary.histogram( + "mel_targets %d" % i, model.tower_mel_targets[i]) tf.compat.v1.summary.scalar("before_loss", model.before_loss) tf.compat.v1.summary.scalar("after_loss", model.after_loss) - + if hparams.predict_linear: tf.compat.v1.summary.scalar("linear_loss", model.linear_loss) for i in range(hparams.tacotron_num_gpus): - 
tf.compat.v1.summary.histogram("mel_outputs %d" % i, model.tower_linear_outputs[i]) - tf.compat.v1.summary.histogram("mel_targets %d" % i, model.tower_linear_targets[i]) - - tf.compat.v1.summary.scalar("regularization_loss", model.regularization_loss) + tf.compat.v1.summary.histogram( + "mel_outputs %d" % i, model.tower_linear_outputs[i]) + tf.compat.v1.summary.histogram( + "mel_targets %d" % i, model.tower_linear_targets[i]) + + tf.compat.v1.summary.scalar( + "regularization_loss", model.regularization_loss) tf.compat.v1.summary.scalar("stop_token_loss", model.stop_token_loss) tf.compat.v1.summary.scalar("loss", model.loss) - tf.compat.v1.summary.scalar("learning_rate", model.learning_rate) # Control learning rate decay speed + # Control learning rate decay speed + tf.compat.v1.summary.scalar("learning_rate", model.learning_rate) if hparams.tacotron_teacher_forcing_mode == "scheduled": - tf.compat.v1.summary.scalar("teacher_forcing_ratio", model.ratio) # Control teacher forcing + # Control teacher forcing + tf.compat.v1.summary.scalar("teacher_forcing_ratio", model.ratio) # ratio decay when mode = "scheduled" gradient_norms = [tf.norm(tensor=grad) for grad in model.gradients] tf.compat.v1.summary.histogram("gradient_norm", gradient_norms) - tf.compat.v1.summary.scalar("max_gradient_norm", tf.reduce_max(input_tensor=gradient_norms)) # visualize + tf.compat.v1.summary.scalar("max_gradient_norm", tf.reduce_max( + input_tensor=gradient_norms)) # visualize # gradients (in case of explosion) return tf.compat.v1.summary.merge_all() @@ -63,17 +72,18 @@ def add_train_stats(model, hparams): def add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, loss): values = [ - tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_before_loss", - simple_value=before_loss), - tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_after_loss", - simple_value=after_loss), - tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/stop_token_loss", - simple_value=stop_token_loss), - tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss), + tf.compat.v1.Summary.Value( + tag="Tacotron_eval_model/eval_stats/eval_before_loss", simple_value=before_loss), + tf.compat.v1.Summary.Value( + tag="Tacotron_eval_model/eval_stats/eval_after_loss", simple_value=after_loss), + tf.compat.v1.Summary.Value( + tag="Tacotron_eval_model/eval_stats/stop_token_loss", simple_value=stop_token_loss), + tf.compat.v1.Summary.Value( + tag="Tacotron_eval_model/eval_stats/eval_loss", simple_value=loss), ] if linear_loss is not None: - values.append(tf.compat.v1.Summary.Value(tag="Tacotron_eval_model/eval_stats/eval_linear_loss", - simple_value=linear_loss)) + values.append(tf.compat.v1.Summary.Value( + tag="Tacotron_eval_model/eval_stats/eval_linear_loss", simple_value=linear_loss)) test_summary = tf.compat.v1.Summary(value=values) summary_writer.add_summary(test_summary, step) @@ -85,7 +95,7 @@ def time_string(): def model_train_mode(args, feeder, hparams, global_step): with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope: model = create_model("Tacotron", hparams) - model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings, + model.initialize(feeder.inputs, feeder.input_lengths, feeder.speaker_embeddings, feeder.mel_targets, feeder.token_targets, targets_lengths=feeder.targets_lengths, global_step=global_step, is_training=True, split_infos=feeder.split_infos) @@ -98,9 +108,9 
@@ def model_train_mode(args, feeder, hparams, global_step): def model_test_mode(args, feeder, hparams, global_step): with tf.compat.v1.variable_scope("Tacotron_model", reuse=tf.compat.v1.AUTO_REUSE) as scope: model = create_model("Tacotron", hparams) - model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, + model.initialize(feeder.eval_inputs, feeder.eval_input_lengths, feeder.eval_speaker_embeddings, feeder.eval_mel_targets, - feeder.eval_token_targets, targets_lengths=feeder.eval_targets_lengths, + feeder.eval_token_targets, targets_lengths=feeder.eval_targets_lengths, global_step=global_step, is_training=False, is_evaluating=True, split_infos=feeder.eval_split_infos) model.add_loss() @@ -126,28 +136,28 @@ def train(log_dir, args, hparams): os.makedirs(eval_wav_dir, exist_ok=True) os.makedirs(tensorboard_dir, exist_ok=True) os.makedirs(meta_folder, exist_ok=True) - + checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt") metadat_fpath = os.path.join(args.synthesizer_root, "train.txt") - + log("Checkpoint path: {}".format(checkpoint_fpath)) log("Loading training data from: {}".format(metadat_fpath)) log("Using model: Tacotron") log(hparams_debug_string()) - + # Start by setting a seed for repeatability tf.compat.v1.set_random_seed(hparams.tacotron_random_seed) - + # Set up data feeder coord = tf.train.Coordinator() with tf.compat.v1.variable_scope("datafeeder") as scope: feeder = Feeder(coord, metadat_fpath, hparams) - + # Set up model: global_step = tf.Variable(0, name="global_step", trainable=False) model, stats = model_train_mode(args, feeder, hparams, global_step) eval_model = model_test_mode(args, feeder, hparams, global_step) - + # Embeddings metadata char_embedding_meta = os.path.join(meta_folder, "CharacterEmbeddings.tsv") if not os.path.isfile(char_embedding_meta): @@ -155,143 +165,151 @@ def train(log_dir, args, hparams): for symbol in symbols: if symbol == " ": symbol = "\\s" # For visual purposes, swap space with \s - + f.write("{}\n".format(symbol)) - + char_embedding_meta = char_embedding_meta.replace(log_dir, "..") - + # Book keeping step = 0 time_window = ValueWindow(100) loss_window = ValueWindow(100) saver = tf.compat.v1.train.Saver(max_to_keep=5) - - log("Tacotron training set to a maximum of {} steps".format(args.tacotron_train_steps)) - + + log("Tacotron training set to a maximum of {} steps".format( + args.tacotron_train_steps)) + # Memory allocation on the GPU as needed config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True + #config.gpu_options.per_process_gpu_memory_fraction = 0.4 config.allow_soft_placement = True - + # Train with tf.compat.v1.Session(config=config) as sess: try: - summary_writer = tf.compat.v1.summary.FileWriter(tensorboard_dir, sess.graph) - + summary_writer = tf.compat.v1.summary.FileWriter( + tensorboard_dir, sess.graph) sess.run(tf.compat.v1.global_variables_initializer()) - + # saved model restoring if args.restore: # Restore saved model if the user requested it, default = True try: checkpoint_state = tf.train.get_checkpoint_state(save_dir) - + if checkpoint_state and checkpoint_state.model_checkpoint_path: log("Loading checkpoint {}".format(checkpoint_state.model_checkpoint_path), slack=True) - saver.restore(sess, checkpoint_state.model_checkpoint_path) - + saver.restore( + sess, checkpoint_state.model_checkpoint_path) + else: log("No model to load at {}".format(save_dir), slack=True) - saver.save(sess, checkpoint_fpath, global_step=global_step) - + saver.save(sess, checkpoint_fpath, + 
global_step=global_step) + except tf.errors.OutOfRangeError as e: log("Cannot restore checkpoint: {}".format(e), slack=True) else: log("Starting new training!", slack=True) saver.save(sess, checkpoint_fpath, global_step=global_step) - + # initializing feeder feeder.start_threads(sess) - + # Training loop while not coord.should_stop() and step < args.tacotron_train_steps: start_time = time.time() - step, loss, opt = sess.run([global_step, model.loss, model.optimize]) + step, loss, opt = sess.run( + [global_step, model.loss, model.optimize]) time_window.append(time.time() - start_time) loss_window.append(loss) message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]".format( step, time_window.average, loss, loss_window.average) - log(message, end="\r", slack=(step % args.checkpoint_interval == 0)) + log(message, end="\r", slack=(step % + args.checkpoint_interval == 0)) print(message) - + if loss > 100 or np.isnan(loss): - log("Loss exploded to {:.5f} at step {}".format(loss, step)) + log("Loss exploded to {:.5f} at step {}".format( + loss, step)) raise Exception("Loss exploded") - + if step % args.summary_interval == 0: log("\nWriting summary at step {}".format(step)) summary_writer.add_summary(sess.run(stats), step) - + if step % args.eval_interval == 0: # Run eval and save eval stats log("\nRunning evaluation at step {}".format(step)) - + eval_losses = [] before_losses = [] after_losses = [] stop_token_losses = [] linear_losses = [] linear_loss = None - + if hparams.predict_linear: for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, \ - mel_t, t_len, align, lin_p, lin_t = sess.run( - [ - eval_model.tower_loss[0], eval_model.tower_before_loss[0], - eval_model.tower_after_loss[0], - eval_model.tower_stop_token_loss[0], - eval_model.tower_linear_loss[0], - eval_model.tower_mel_outputs[0][0], - eval_model.tower_mel_targets[0][0], - eval_model.tower_targets_lengths[0][0], - eval_model.tower_alignments[0][0], - eval_model.tower_linear_outputs[0][0], - eval_model.tower_linear_targets[0][0], - ]) + mel_t, t_len, align, lin_p, lin_t = sess.run( + [ + eval_model.tower_loss[0], eval_model.tower_before_loss[0], + eval_model.tower_after_loss[0], + eval_model.tower_stop_token_loss[0], + eval_model.tower_linear_loss[0], + eval_model.tower_mel_outputs[0][0], + eval_model.tower_mel_targets[0][0], + eval_model.tower_targets_lengths[0][0], + eval_model.tower_alignments[0][0], + eval_model.tower_linear_outputs[0][0], + eval_model.tower_linear_targets[0][0], + ]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) linear_losses.append(linear_loss) linear_loss = sum(linear_losses) / len(linear_losses) - + wav = audio.inv_linear_spectrogram(lin_p.T, hparams) audio.save_wav(wav, os.path.join(eval_wav_dir, "step-{}-eval-wave-from-linear.wav".format( step)), sr=hparams.sample_rate) - + else: for i in tqdm(range(feeder.test_steps)): eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len,\ - align = sess.run( - [ - eval_model.tower_loss[0], eval_model.tower_before_loss[0], - eval_model.tower_after_loss[0], - eval_model.tower_stop_token_loss[0], - eval_model.tower_mel_outputs[0][0], - eval_model.tower_mel_targets[0][0], - eval_model.tower_targets_lengths[0][0], - eval_model.tower_alignments[0][0] - ]) + align = sess.run( + [ + eval_model.tower_loss[0], eval_model.tower_before_loss[0], + eval_model.tower_after_loss[0], + 
eval_model.tower_stop_token_loss[0], + eval_model.tower_mel_outputs[0][0], + eval_model.tower_mel_targets[0][0], + eval_model.tower_targets_lengths[0][0], + eval_model.tower_alignments[0][0] + ]) eval_losses.append(eloss) before_losses.append(before_loss) after_losses.append(after_loss) stop_token_losses.append(stop_token_loss) - + eval_loss = sum(eval_losses) / len(eval_losses) before_loss = sum(before_losses) / len(before_losses) after_loss = sum(after_losses) / len(after_losses) - stop_token_loss = sum(stop_token_losses) / len(stop_token_losses) - + stop_token_loss = sum( + stop_token_losses) / len(stop_token_losses) + log("Saving eval log to {}..".format(eval_dir)) # Save some log to monitor model improvement on same unseen sequence wav = audio.inv_mel_spectrogram(mel_p.T, hparams) audio.save_wav(wav, os.path.join(eval_wav_dir, "step-{}-eval-wave-from-mel.wav".format(step)), sr=hparams.sample_rate) - + plot.plot_alignment(align, os.path.join(eval_plot_dir, "step-{}-eval-align.png".format(step)), title="{}, {}, step={}, loss={:.5f}".format("Tacotron", @@ -301,7 +319,7 @@ def train(log_dir, args, hparams): max_len=t_len // hparams.outputs_per_step) plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, "step-{" - "}-eval-mel-spectrogram.png".format( + "}-eval-mel-spectrogram.png".format( step)), title="{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), @@ -309,7 +327,7 @@ def train(log_dir, args, hparams): eval_loss), target_spectrogram=mel_t, max_len=t_len) - + if hparams.predict_linear: plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir, "step-{}-eval-linear-spectrogram.png".format( @@ -318,17 +336,16 @@ def train(log_dir, args, hparams): "Tacotron", time_string(), step, eval_loss), target_spectrogram=lin_t, max_len=t_len, auto_aspect=True) - + log("Eval loss for global step {}: {:.3f}".format(step, eval_loss)) log("Writing eval summary!") add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss) - - if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \ - step % 50 == 0: + + if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps: # Save model and current global step saver.save(sess, checkpoint_fpath, global_step=global_step) - + log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..") input_seq, mel_prediction, alignment, target, target_length = sess.run([ model.tower_inputs[0][0], @@ -337,21 +354,23 @@ def train(log_dir, args, hparams): model.tower_mel_targets[0][0], model.tower_targets_lengths[0][0], ]) - + # save predicted mel spectrogram to disk (debug) mel_filename = "mel-prediction-step-{}.npy".format(step) np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False) - + # save griffin lim inverted wav for debug (mel -> wav) wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams) audio.save_wav(wav, - os.path.join(wav_dir, "step-{}-wave-from-mel.wav".format(step)), + os.path.join( + wav_dir, "step-{}-wave-from-mel.wav".format(step)), sr=hparams.sample_rate) - + # save alignment plot to disk (control purposes) plot.plot_alignment(alignment, - os.path.join(plot_dir, "step-{}-align.png".format(step)), + os.path.join( + plot_dir, "step-{}-align.png".format(step)), title="{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, loss), @@ -365,23 +384,24 @@ def train(log_dir, args, hparams): step, loss), target_spectrogram=target, max_len=target_length) - log("Input at step {}: {}".format(step, 
sequence_to_text(input_seq))) - + log("Input at step {}: {}".format( + step, sequence_to_text(input_seq))) + if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1: # Get current checkpoint state checkpoint_state = tf.train.get_checkpoint_state(save_dir) - + # Update Projector log("\nSaving Model Character Embeddings visualization..") add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path) log("Tacotron Character embeddings have been updated on tensorboard!") - + log("Tacotron training complete after {} global steps!".format( args.tacotron_train_steps), slack=True) return save_dir - + except Exception as e: log("Exiting due to exception: {}".format(e), slack=True) traceback.print_exc() diff --git a/synthesizer_train.py b/synthesizer_train.py index 5c46d1af6..43dae3924 100644 --- a/synthesizer_train.py +++ b/synthesizer_train.py @@ -12,44 +12,46 @@ def prepare_run(args): run_name = args.name log_dir = os.path.join(args.models_dir, "logs-{}".format(run_name)) os.makedirs(log_dir, exist_ok=True) - infolog.init(os.path.join(log_dir, "Terminal_train_log"), run_name, args.slack_url) + infolog.init(os.path.join(log_dir, "Terminal_train_log"), + run_name, args.slack_url) return log_dir, modified_hp if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("name", help="Name of the run and of the logging directory.") - parser.add_argument('-d', "--synthesizer_root", type=str, default='./datasets/SV2TTS/synthesizer/', help=\ - "Path to the synthesizer training data that contains the audios and the train.txt file. " - "If you let everything as default, it should be /SV2TTS/synthesizer/.") - parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\ - "Path to the output directory that will contain the saved model weights and the logs.") + parser.add_argument( + "name", help="Name of the run and of the logging directory.") + parser.add_argument('-d', "--synthesizer_root", type=str, default='./datasets/SV2TTS/synthesizer/', + help="Path to the synthesizer training data that contains the audios and the train.txt file. 
" + "If you let everything as default, it should be /SV2TTS/synthesizer/.") + parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", + help="Path to the output directory that will contain the saved model weights and the logs.") parser.add_argument("--mode", default="synthesis", help="mode for synthesis of tacotron after training") parser.add_argument("--GTA", default="True", - help="Ground truth aligned synthesis, defaults to True, only considered " - "in Tacotron synthesis mode") + help="Ground truth aligned synthesis, defaults to True, only considered in Tacotron synthesis mode") parser.add_argument("--restore", type=bool, default=True, help="Set this to False to do a fresh training") parser.add_argument("--summary_interval", type=int, default=2500, help="Steps between running summary ops") parser.add_argument("--embedding_interval", type=int, default=10000, help="Steps between updating embeddings projection visualization") - parser.add_argument("--checkpoint_interval", type=int, default=2000, # Was 5000 + parser.add_argument("--checkpoint_interval", type=int, default=2000, # Was 5000 help="Steps between writing checkpoints") - parser.add_argument("--eval_interval", type=int, default=100000, # Was 10000 + parser.add_argument("--eval_interval", type=int, default=100000, # Was 10000 help="Steps between eval on test data") - parser.add_argument("--tacotron_train_steps", type=int, default=2000000, # Was 100000 + parser.add_argument("--tacotron_train_steps", type=int, default=2000000, # Was 100000 help="total number of tacotron training steps") - parser.add_argument("--tf_log_level", type=int, default=1, help="Tensorflow C++ log level.") + parser.add_argument("--tf_log_level", type=int, + default=1, help="Tensorflow C++ log level.") parser.add_argument("--slack_url", default=None, help="slack webhook notification destination link") parser.add_argument("--hparams", default="", help="Hyperparameter overrides as a comma-separated list of name=value " - "pairs") + "pairs") args = parser.parse_args() print_args(args, parser) - + log_dir, hparams = prepare_run(args) - + tacotron_train(args, log_dir, hparams)