|
1 |
| -from synthesizer.tacotron2 import Tacotron2 |
2 |
| -from synthesizer.hparams import hparams as default_hparams |
3 |
| -# from multiprocess.pool import Pool # You're free to use either one |
4 |
| -from multiprocessing import Pool # |
5 |
| -from synthesizer.utils import audio |
6 |
| -from pathlib import Path |
7 |
| -from typing import Union, List |
8 |
| -import tensorflow as tf |
9 |
| -import numpy as np |
10 |
| -import numba.cuda |
11 |
| -import librosa |
12 |
| - |
13 |
| - |
14 |
class Synthesizer:
    """
    Inference wrapper around a Tacotron2 checkpoint.

    The checkpoint is located at construction time, but the model itself is only built on the
    first synthesis request (or on an explicit call to load()). In low memory mode the model is
    instead rebuilt in a throwaway worker process on every request.
    """
    # Class-level defaults so that the static helpers' defaults are also visible on the class.
    sample_rate = default_hparams.sample_rate
    hparams = default_hparams

    def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False, hparams=None):
        """
        Creates a synthesizer ready for inference. The actual model isn't loaded in memory until
        needed or until load() is called.

        :param checkpoints_dir: path to the directory containing the checkpoint file as well as the
        weight files (.data, .index and .meta files). A plain string is accepted too.
        :param verbose: if False, only tensorflow's output will be printed TODO: suppress them too
        :param low_mem: if True, the model will be loaded in a separate process and its resources
        will be released after each usage. Adds a large overhead, only recommended if your GPU
        memory is low (<= 2gb)
        :param hparams: hyperparameters to use instead of the default ones
        :raises Exception: if no checkpoint state is found under checkpoints_dir
        """
        self.hparams = hparams or default_hparams
        self.sample_rate = self.hparams.sample_rate

        self.verbose = verbose
        self._low_mem = low_mem

        # Normalize so that str paths work with the .parent access below
        checkpoints_dir = Path(checkpoints_dir)

        # Prepare the model
        self._model = None  # type: Tacotron2
        checkpoint_state = tf.train.get_checkpoint_state(str(checkpoints_dir))
        if checkpoint_state is None:
            raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
        self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
        if verbose:
            model_name = checkpoints_dir.parent.name.replace("logs-", "")
            # The step is read from whatever follows the last dash of the checkpoint path
            step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
            print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))

    def is_loaded(self):
        """
        Whether the model has been built and is being kept by this instance.
        """
        return self._model is not None

    def load(self):
        """
        Effectively loads the model given the weights file that was found in the constructor.

        :raises Exception: in low memory mode, where the model is never kept loaded
        """
        if self._low_mem:
            raise Exception("Cannot load the synthesizer permanently in low mem mode")
        tf.reset_default_graph()
        self._model = Tacotron2(self.checkpoint_fpath, self.hparams)

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments.
        """
        if not self._low_mem:
            # Usual inference mode: load the model on the first request and keep it loaded.
            if not self.is_loaded():
                self.load()
            specs, alignments = self._model.my_synthesize(embeddings, texts)
        else:
            # Low memory inference mode: load the model upon every request. The model has to be
            # loaded in a separate process to be able to release GPU memory (a simple workaround
            # to tensorflow's intricacies)
            # Custom hparams are forwarded so that the worker builds the same model as load()
            # would. The defaults are not sent: the worker falls back to them by itself.
            # NOTE(review): custom hparams must be picklable to cross the process boundary.
            worker_hparams = None if self.hparams is default_hparams else self.hparams
            job = (self.checkpoint_fpath, embeddings, texts, worker_hparams)
            # starmap blocks until the result is back; leaving the with block then terminates
            # the worker instead of leaking one process per request.
            with Pool(1) as pool:
                specs, alignments = pool.starmap(
                    Synthesizer._one_shot_synthesize_spectrograms, [job])[0]

        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts, hparams=None):
        """
        Builds the model, runs a single synthesis and tears everything down again. Meant to be
        run in a worker process (see the low memory mode of synthesize_spectrograms).

        :return: the spectrograms and the alignments, as copies
        """
        # Load the model and forward the inputs
        hparams = hparams or default_hparams
        tf.reset_default_graph()
        model = Tacotron2(checkpoint_fpath, hparams)
        specs, alignments = model.my_synthesize(embeddings, texts)

        # Detach the outputs (not doing so will cause the process to hang)
        specs, alignments = [spec.copy() for spec in specs], alignments.copy()

        # Close cuda for this process
        model.session.close()
        numba.cuda.select_device(0)
        numba.cuda.close()

        return specs, alignments

    @staticmethod
    def load_preprocess_wav(fpath, hparams=None):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.

        :param fpath: path to the audio file, as a string or a Path
        :param hparams: hyperparameters to use instead of the default ones
        :return: the waveform, resampled to hparams.sample_rate and rescaled if hparams.rescale
        """
        hparams = hparams or default_hparams
        # The path is passed as a string and sr by keyword so that this works across librosa
        # versions
        wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]
        if hparams.rescale:
            # Empty or fully silent audio has no peak to normalize by: leave it untouched rather
            # than raising or filling the waveform with NaNs.
            peak = np.abs(wav).max() if wav.size else 0
            if peak > 0:
                wav = wav / peak * hparams.rescaling_max
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray], hparams=None):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
        were fed to the synthesizer when training.

        :param fpath_or_wav: either the path to an audio file or an already loaded waveform
        :param hparams: hyperparameters to use instead of the default ones
        :return: the mel spectrogram as a float32 numpy array
        """
        hparams = hparams or default_hparams
        if isinstance(fpath_or_wav, (str, Path)):
            # Load with the same hparams that are used for the spectrogram itself
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav, hparams)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel, hparams=None):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
        with the same parameters as the ones given here (by default, those present in hparams.py).
        """
        hparams = hparams or default_hparams
        return audio.inv_melspectrogram(mel, hparams)
| 1 | +from synthesizer.tacotron2 import Tacotron2 |
| 2 | +from synthesizer.hparams import hparams as default_hparams |
| 3 | +# from multiprocess.pool import Pool # You're free to use either one |
| 4 | +from multiprocessing import Pool # |
| 5 | +from synthesizer.utils import audio |
| 6 | +from pathlib import Path |
| 7 | +from typing import Union, List |
| 8 | +import tensorflow as tf |
| 9 | +import numpy as np |
| 10 | +import numba.cuda |
| 11 | +import librosa |
| 12 | + |
| 13 | + |
class Synthesizer:
    """
    Inference wrapper around a Tacotron2 checkpoint.

    The checkpoint is located at construction time, but the model itself is only built on the
    first synthesis request (or on an explicit call to load()). In low memory mode the model is
    instead rebuilt in a throwaway worker process on every request.
    """
    # Class-level defaults so that the static helpers' defaults are also visible on the class.
    sample_rate = default_hparams.sample_rate
    hparams = default_hparams

    def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False, hparams=None):
        """
        Creates a synthesizer ready for inference. The actual model isn't loaded in memory until
        needed or until load() is called.

        :param checkpoints_dir: path to the directory containing the checkpoint file as well as the
        weight files (.data, .index and .meta files). A plain string is accepted too.
        :param verbose: if False, only tensorflow's output will be printed TODO: suppress them too
        :param low_mem: if True, the model will be loaded in a separate process and its resources
        will be released after each usage. Adds a large overhead, only recommended if your GPU
        memory is low (<= 2gb)
        :param hparams: hyperparameters to use instead of the default ones
        :raises Exception: if no checkpoint state is found under checkpoints_dir
        """
        self.hparams = hparams or default_hparams
        self.sample_rate = self.hparams.sample_rate

        self.verbose = verbose
        self._low_mem = low_mem

        # Normalize so that str paths work with the .parent access below
        checkpoints_dir = Path(checkpoints_dir)

        # Prepare the model
        self._model = None  # type: Tacotron2
        checkpoint_state = tf.train.get_checkpoint_state(str(checkpoints_dir))
        if checkpoint_state is None:
            raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
        self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
        if verbose:
            model_name = checkpoints_dir.parent.name.replace("logs-", "")
            # The step is read from whatever follows the last dash of the checkpoint path
            step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
            print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))

    def is_loaded(self):
        """
        Whether the model has been built and is being kept by this instance.
        """
        return self._model is not None

    def load(self):
        """
        Effectively loads the model given the weights file that was found in the constructor.

        :raises Exception: in low memory mode, where the model is never kept loaded
        """
        if self._low_mem:
            raise Exception("Cannot load the synthesizer permanently in low mem mode")
        tf.reset_default_graph()
        self._model = Tacotron2(self.checkpoint_fpath, self.hparams)

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments.
        """
        if not self._low_mem:
            # Usual inference mode: load the model on the first request and keep it loaded.
            if not self.is_loaded():
                self.load()
            specs, alignments = self._model.my_synthesize(embeddings, texts)
        else:
            # Low memory inference mode: load the model upon every request. The model has to be
            # loaded in a separate process to be able to release GPU memory (a simple workaround
            # to tensorflow's intricacies)
            # Custom hparams are forwarded so that the worker builds the same model as load()
            # would. The defaults are not sent: the worker falls back to them by itself.
            # NOTE(review): custom hparams must be picklable to cross the process boundary.
            worker_hparams = None if self.hparams is default_hparams else self.hparams
            job = (self.checkpoint_fpath, embeddings, texts, worker_hparams)
            # starmap blocks until the result is back; leaving the with block then terminates
            # the worker instead of leaking one process per request.
            with Pool(1) as pool:
                specs, alignments = pool.starmap(
                    Synthesizer._one_shot_synthesize_spectrograms, [job])[0]

        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts, hparams=None):
        """
        Builds the model, runs a single synthesis and tears everything down again. Meant to be
        run in a worker process (see the low memory mode of synthesize_spectrograms).

        :return: the spectrograms and the alignments, as copies
        """
        # Load the model and forward the inputs
        hparams = hparams or default_hparams
        tf.reset_default_graph()
        model = Tacotron2(checkpoint_fpath, hparams)
        specs, alignments = model.my_synthesize(embeddings, texts)

        # Detach the outputs (not doing so will cause the process to hang)
        specs, alignments = [spec.copy() for spec in specs], alignments.copy()

        # Close cuda for this process
        model.session.close()
        numba.cuda.select_device(0)
        numba.cuda.close()

        return specs, alignments

    @staticmethod
    def load_preprocess_wav(fpath, hparams=None):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.

        :param fpath: path to the audio file, as a string or a Path
        :param hparams: hyperparameters to use instead of the default ones
        :return: the waveform, resampled to hparams.sample_rate and rescaled if hparams.rescale
        """
        hparams = hparams or default_hparams
        # sr is passed by keyword so that this works across librosa versions
        wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]
        if hparams.rescale:
            # Empty or fully silent audio has no peak to normalize by: leave it untouched rather
            # than raising or filling the waveform with NaNs.
            peak = np.abs(wav).max() if wav.size else 0
            if peak > 0:
                wav = wav / peak * hparams.rescaling_max
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray], hparams=None):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
        were fed to the synthesizer when training.

        :param fpath_or_wav: either the path to an audio file or an already loaded waveform
        :param hparams: hyperparameters to use instead of the default ones
        :return: the mel spectrogram as a float32 numpy array
        """
        hparams = hparams or default_hparams
        if isinstance(fpath_or_wav, (str, Path)):
            # Load with the same hparams that are used for the spectrogram itself
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav, hparams)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel, hparams=None):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
        with the same parameters as the ones given here (by default, those present in hparams.py).
        """
        hparams = hparams or default_hparams
        return audio.inv_melspectrogram(mel, hparams)
0 commit comments