Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/dev' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
lexkoro committed Jul 13, 2020
2 parents 30195a3 + 98becfe commit c4a0f4d
Show file tree
Hide file tree
Showing 16 changed files with 243 additions and 71 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@

<img src="https://travis-ci.org/mozilla/TTS.svg?branch=dev"/>

This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en). TTS aims a deep learning based Text2Speech engine, low in cost and high in quality. To begin with, you can hear a sample synthesized voice from [here](https://soundcloud.com/user-565970875/commonvoice-loc-sens-attn).
This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en). TTS aims to be a deep-learning-based Text2Speech engine, low in cost and high in quality.

If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about TTS architectures and their comparisons.
You can check out some of the synthesized voice samples [here](https://erogol.github.io/ddc-samples/).

If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about some TTS architectures, and [here](https://github.com/erogol/TTS-papers) a list of up-to-date research papers.

[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/0)](https://sourcerer.io/fame/erogol/erogol/TTS/links/0)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/1)](https://sourcerer.io/fame/erogol/erogol/TTS/links/1)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/2)](https://sourcerer.io/fame/erogol/erogol/TTS/links/2)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/3)](https://sourcerer.io/fame/erogol/erogol/TTS/links/3)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/4)](https://sourcerer.io/fame/erogol/erogol/TTS/links/4)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/5)](https://sourcerer.io/fame/erogol/erogol/TTS/links/5)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/6)](https://sourcerer.io/fame/erogol/erogol/TTS/links/6)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/7)](https://sourcerer.io/fame/erogol/erogol/TTS/links/7)

Expand Down
21 changes: 13 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
numpy>=1.16.0
torch>=1.5
librosa>=0.5.1
Unidecode>=0.4.20
tensorboard
tensorflow>=2.2
numpy>=1.16.0
scipy>=0.19.0
numba==0.48
librosa==0.7.2
phonemizer>=2.2.0
unidecode==0.4.20
attrdict
tensorboardX
matplotlib
Pillow
flask
scipy
tqdm
soundfile
phonemizer
bokeh==1.4.0
inflect
bokeh==1.4.0
soundfile
nose==1.3.7
cardboardlint==1.3.0
pylint==2.5.3
19 changes: 10 additions & 9 deletions requirements_tests.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
torch>=1.5
tensorflow==2.3rc
numpy>=1.16.0
scipy>=0.19.0
numba==0.48
torch>=0.4.1
tensorflow>=2.2
librosa>=0.5.1
Unidecode>=0.4.20
tensorboard
librosa==0.7.2
phonemizer>=2.2.0
unidecode==0.4.20
attrdict
tensorboardX
matplotlib
Pillow
flask
scipy
tqdm
soundfile
inflect
phonemizer
bokeh==1.4.0
nose
soundfile
nose==1.3.7
cardboardlint==1.3.0
26 changes: 8 additions & 18 deletions server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,9 @@ def convert_boolean(x):
parser.add_argument('--tts_config', type=str, help='path to TTS config.json file')
parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model')
parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.')
parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.')
parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.')
parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.')
parser.add_argument('--pwgan_lib_path', type=str, default=None, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
parser.add_argument('--pwgan_file', type=str, default=None, help='path to ParallelWaveGAN checkpoint file.')
parser.add_argument('--pwgan_config', type=str, default=None, help='path to ParallelWaveGAN config file.')
parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
parser.add_argument('--port', type=int, default=5002, help='port to listen on.')
Expand All @@ -46,30 +43,23 @@ def convert_boolean(x):
wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar')
wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json')

embedded_pwgan_folder = os.path.join(embedded_models_folder, 'pwgan')
pwgan_checkpoint_file = os.path.join(embedded_pwgan_folder, 'checkpoint.pkl')
pwgan_config_file = os.path.join(embedded_pwgan_folder, 'config.yml')

args = create_argparser().parse_args()

# If these were not specified in the CLI args, use default values with embedded model files
if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
args.tts_checkpoint = tts_checkpoint_file
if not args.tts_config and os.path.isfile(tts_config_file):
args.tts_config = tts_config_file
if not args.vocoder_checkpoint and os.path.isfile(tts_checkpoint_file):
args.tts_checkpoint = tts_checkpoint_file
if not args.vocoder_config and os.path.isfile(tts_config_file):
args.tts_config = tts_config_file

if not args.wavernn_file and os.path.isfile(wavernn_checkpoint_file):
args.wavernn_file = wavernn_checkpoint_file
if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
args.vocoder_checkpoint = vocoder_checkpoint_file
if not args.vocoder_config and os.path.isfile(vocoder_config_file):
args.vocoder_config = vocoder_config_file

if not args.wavernn_checkpoint and os.path.isfile(wavernn_checkpoint_file):
args.wavernn_checkpoint = wavernn_checkpoint_file
if not args.wavernn_config and os.path.isfile(wavernn_config_file):
args.wavernn_config = wavernn_config_file
if not args.pwgan_file and os.path.isfile(pwgan_checkpoint_file):
args.pwgan_file = pwgan_checkpoint_file
if not args.pwgan_config and os.path.isfile(pwgan_config_file):
args.pwgan_config = pwgan_config_file

synthesizer = Synthesizer(args)

Expand Down
5 changes: 3 additions & 2 deletions server/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,16 @@ def __init__(self, config):
self.wavernn = None
self.vocoder_model = None
self.config = config
print(config)
self.use_cuda = self.config.use_cuda
if self.use_cuda:
assert torch.cuda.is_available(), "CUDA is not availabe on this machine."
self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
self.config.use_cuda)
if self.config.vocoder_file:
if self.config.vocoder_checkpoint:
self.load_vocoder(self.config.vocoder_checkpoint, self.config.vocoder_config, self.config.use_cuda)
if self.config.wavernn_lib_path:
self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file,
self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_checkpoint,
self.config.wavernn_config, self.config.use_cuda)

def load_tts(self, tts_checkpoint, tts_config, use_cuda):
Expand Down
72 changes: 53 additions & 19 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,41 @@ def run(self):
shutil.copy(args.model_config, embedded_config_path)
package_data.extend([embedded_checkpoint_path, embedded_config_path])


def pip_install(package_name):
    """Install ``package_name`` with pip, using the interpreter running this script.

    The install stays best-effort: a failure does not raise, but it is
    reported on stderr and the exit status is returned so callers can react.

    Args:
        package_name (str): requirement specifier passed verbatim to
            ``pip install`` (e.g. ``'tensorflow>=2.2.0'``).

    Returns:
        int: exit status of the pip process (0 on success).
    """
    # invoke pip as a module of sys.executable, i.e. the interpreter running this script
    return_code = subprocess.call(
        [sys.executable, '-m', 'pip', 'install', package_name]
    )
    if return_code != 0:
        # do not abort, but do not hide the failure either
        print(
            "pip install {} failed with exit code {}".format(package_name, return_code),
            file=sys.stderr,
        )
    return return_code


# Dependency specification for this package, split by how each group is installed:
#   'install_requires' is passed to setup(install_requires=...) further down.
#   'pip_install' entries are installed one by one through pip_install() after
#   setup() has run.
requirements = {
'install_requires':[
"torch>=1.5",
"numpy>=1.16.0",
"numba==0.48",
"scipy>=0.19.0",
"librosa==0.7.2",
"unidecode==0.4.20",
"attrdict",
"tensorboardX",
"matplotlib",
"Pillow",
"flask",
"tqdm",
"inflect",
"bokeh==1.4.0",
"soundfile",
"phonemizer>=2.2.0",
"nose==1.3.7",
"cardboardlint==1.3.0",
"pylint==2.5.3",
],
# kept out of 'install_requires': per the note at the end of this file,
# listing tensorflow there breaks some of the other dependencies
'pip_install':[
'tensorflow>=2.2.0',
]
}


setup(
name='TTS',
version=version,
Expand All @@ -95,24 +130,23 @@ def run(self):
'build_py': build_py,
'develop': develop,
},
install_requires=[
"scipy>=0.19.0",
"torch>=1.5",
"numpy>=1.16.0",
"librosa==0.6.2",
"unidecode==0.4.20",
"attrdict",
"tensorboardX",
"matplotlib",
"Pillow",
"flask",
"tqdm",
"inflect",
"bokeh==1.4.0",
"soundfile",
"phonemizer @ https://github.com/bootphon/phonemizer/tarball/master",
],
dependency_links=[
"https://github.com/bootphon/phonemizer/tarball/master#egg=phonemizer-1.0.1"
install_requires=requirements['install_requires'],
python_requires='>=3.6.0',
classifiers=[
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
'Development Status :: 3 - Alpha',
"Intended Audience :: Science/Research :: Developers",
"Operating System :: POSIX :: Linux",
'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
"Topic :: Software Development :: Libraries :: Python Modules :: Speech :: Sound/Audio :: Multimedia :: Artificial Intelligence",
]
)

# for some reason having tensorflow in 'install_requires'
# breaks some of the dependencies.
for module in requirements['pip_install']:
pip_install(module)
Binary file modified tests/inputs/scale_stats.npy
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/inputs/server_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"wavernn_file": null, // wavernn checkpoint file name
"wavernn_config": null, // wavernn config file
"vocoder_config":null,
"vocoder_file": null,
"vocoder_checkpoint": null,
"is_wavernn_batched":true,
"port": 5002,
"use_cuda": false,
Expand Down
3 changes: 3 additions & 0 deletions tests/test_server_package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ rm -f dist/*.whl
python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json
pip install --quiet dist/TTS*.whl

# this is related to https://github.com/librosa/librosa/issues/1160
pip install numba==0.48

python -m TTS.server.server &
SERVER_PID=$!

Expand Down
Empty file added tf/__init__.py
Empty file.
1 change: 1 addition & 0 deletions tf/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Empty file added tf/tests/test_layers_tf.py
Empty file.
134 changes: 134 additions & 0 deletions tf/tests/test_tacotron2_tf_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import os
import torch
import unittest
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('INFO')

from TTS.utils.io import load_config
from TTS.tf.models.tacotron2 import Tacotron2
from TTS.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model

#pylint: disable=unused-variable

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

file_path = os.path.dirname(os.path.realpath(__file__)).replace('/tf/', '/')
c = load_config(os.path.join(file_path, 'test_config.json'))


class TacotronTFTrainTest(unittest.TestCase):
    """Forward-pass shape checks and a TFLite round trip for the TF Tacotron2 model."""

    @staticmethod
    def generate_dummy_inputs():
        """Build one random batch of dummy inputs (batch size 8).

        Returns:
            tuple: ``(chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec,
            mel_lengths, stop_targets, speaker_ids)``. The first three are
            converted to TensorFlow tensors; the remaining four stay torch
            tensors.
        """
        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)
        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

        # only the tensors that are fed to the TF model are converted
        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
        chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
        mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
            stop_targets, speaker_ids

    def _check_train_and_inference(self, **model_kwargs):
        """Run a training-mode and an inference-mode forward pass and check shapes.

        Args:
            **model_kwargs: extra keyword arguments forwarded to ``Tacotron2``
                on top of ``num_chars=24, r=c.r, num_speakers=5``.
        """
        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
            stop_targets, speaker_ids = self.generate_dummy_inputs()

        # stop targets are prepared but not passed to the model or asserted on below
        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0

        stop_targets = stop_targets.view(chars_seq.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, **model_kwargs)
        # training pass
        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)

        # check model output shapes
        assert np.all(output[0].shape == mel_spec.shape)
        assert np.all(output[1].shape == mel_spec.shape)
        # NOTE(review): output[2] / output[3] look like per-decoder-step outputs
        # (one step per r frames) -- confirm against the model definition
        assert output[2].shape[2] == chars_seq.shape[1]
        assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
        assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)

        # inference pass
        output = model(chars_seq, training=False)

    def test_train_step(self):
        ''' test forward pass '''
        self._check_train_and_inference()

    def test_forward_attention(self,):
        ''' test forward pass with forward attention enabled '''
        self._check_train_and_inference(forward_attn=True)

    def test_tflite_conversion(self, ):  #pylint:disable=no-self-use
        ''' convert the model to TFLite, reload it and run one inference '''
        tflite_path = 'test_tacotron2.tflite'
        model = Tacotron2(num_chars=24,
                          num_speakers=0,
                          r=3,
                          postnet_output_dim=80,
                          decoder_output_dim=80,
                          attn_type='original',
                          attn_win=False,
                          attn_norm='sigmoid',
                          prenet_type='original',
                          prenet_dropout=True,
                          forward_attn=False,
                          trans_agent=False,
                          forward_attn_mask=False,
                          location_attn=True,
                          attn_K=0,
                          separate_stopnet=True,
                          bidirectional_decoder=False,
                          enable_tflite=True)
        model.build_inference()
        try:
            convert_tacotron2_to_tflite(model, output_path=tflite_path, experimental_converter=True)
            # init tflite model
            tflite_model = load_tflite_model(tflite_path)
            # fake input
            inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32)  #pylint:disable=unexpected-keyword-arg
            # run inference
            # get input and output details
            input_details = tflite_model.get_input_details()
            output_details = tflite_model.get_output_details()
            # reshape input tensor for the new input shape
            tflite_model.resize_tensor_input(input_details[0]['index'], inputs.shape)  #pylint:disable=unexpected-keyword-arg
            tflite_model.allocate_tensors()
            detail = input_details[0]
            tflite_model.set_tensor(detail['index'], inputs)
            # run the tflite_model
            tflite_model.invoke()
            # collect outputs
            decoder_output = tflite_model.get_tensor(output_details[0]['index'])
            postnet_output = tflite_model.get_tensor(output_details[1]['index'])
        finally:
            # remove tflite binary even when a step above fails; the existence
            # check avoids masking the original error if nothing was written
            if os.path.exists(tflite_path):
                os.remove(tflite_path)

Loading

0 comments on commit c4a0f4d

Please sign in to comment.