Skip to content

Commit e7fca9a

Browse files
authored
修复"File contains data in an unknown format"
修改内容与 CorentinJ/Real-Time-Voice-Cloning#371 相同：在 wav_fpath 外面包一层 str()，防止打开文件时出现 "File contains data in an unknown format" 错误。
1 parent bafcbe3 commit e7fca9a

File tree

2 files changed

+460
-460
lines changed

2 files changed

+460
-460
lines changed

zhrtvc/synthesizer/inference.py

+143-143
Original file line numberDiff line numberDiff line change
@@ -1,143 +1,143 @@
1-
from synthesizer.tacotron2 import Tacotron2
2-
from synthesizer.hparams import hparams as default_hparams
3-
# from multiprocess.pool import Pool # You're free to use either one
4-
from multiprocessing import Pool #
5-
from synthesizer.utils import audio
6-
from pathlib import Path
7-
from typing import Union, List
8-
import tensorflow as tf
9-
import numpy as np
10-
import numba.cuda
11-
import librosa
12-
13-
14-
class Synthesizer:
    """
    Inference wrapper around a Tacotron2 checkpoint. Synthesizes mel spectrograms from texts and
    speaker embeddings, and exposes static audio helpers (wav loading, mel spectrogram creation
    and Griffin-Lim inversion).
    """
    # Class-level defaults; __init__ overrides both on the instance.
    sample_rate = default_hparams.sample_rate
    hparams = default_hparams

    def __init__(self, checkpoints_dir: Union[str, Path], verbose=True, low_mem=False,
                 hparams=None):
        """
        Creates a synthesizer ready for inference. The actual model isn't loaded in memory until
        needed or until load() is called.

        :param checkpoints_dir: path to the directory containing the checkpoint file as well as the
        weight files (.data, .index and .meta files)
        :param verbose: if False, only tensorflow's output will be printed TODO: suppress them too
        :param low_mem: if True, the model will be loaded in a separate process and its resources
        will be released after each usage. Adds a large overhead, only recommended if your GPU
        memory is low (<= 2gb)
        :param hparams: hyperparameters to use instead of the defaults from synthesizer.hparams
        :raises Exception: if no checkpoint state can be found under checkpoints_dir
        """
        # Accept plain strings as well: the verbose branch below relies on Path attributes.
        checkpoints_dir = Path(checkpoints_dir)

        self.hparams = hparams or default_hparams
        self.sample_rate = self.hparams.sample_rate

        self.verbose = verbose
        self._low_mem = low_mem

        # Prepare the model
        self._model = None  # type: Tacotron2
        checkpoint_state = tf.train.get_checkpoint_state(str(checkpoints_dir))
        if checkpoint_state is None:
            raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
        self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
        if verbose:
            model_name = checkpoints_dir.parent.name.replace("logs-", "")
            # The training step is read from whatever follows the last '-' in the checkpoint path
            step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
            print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))

    def is_loaded(self):
        """
        Whether the model has been instantiated by load().
        """
        return self._model is not None

    def load(self):
        """
        Effectively loads the model given the checkpoint that was found in the constructor.

        :raises Exception: if the synthesizer was created in low memory mode
        """
        if self._low_mem:
            raise Exception("Cannot load the synthesizer permanently in low mem mode")
        tf.reset_default_graph()
        self._model = Tacotron2(self.checkpoint_fpath, self.hparams)

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments.
        """
        if not self._low_mem:
            # Usual inference mode: load the model on the first request and keep it loaded.
            if not self.is_loaded():
                self.load()
            specs, alignments = self._model.my_synthesize(embeddings, texts)
        else:
            # Low memory inference mode: load the model upon every request. The model has to be
            # loaded in a separate process to be able to release GPU memory (a simple workaround
            # to tensorflow's intricacies)
            # Custom hparams are forwarded so that the worker builds the same model as load()
            # would; None is sent for the defaults so nothing extra has to be pickled.
            hparams = None if self.hparams is default_hparams else self.hparams
            # The context manager terminates the worker once the (blocking) call has returned,
            # instead of leaving the pool alive until garbage collection.
            with Pool(1) as pool:
                specs, alignments = pool.starmap(
                    Synthesizer._one_shot_synthesize_spectrograms,
                    [(self.checkpoint_fpath, embeddings, texts, hparams)])[0]

        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts, hparams=None):
        """
        Loads a fresh model, synthesizes once and releases the session and the CUDA context.
        Used as the worker function of the low memory mode.

        :param checkpoint_fpath: path of the checkpoint to load
        :param embeddings: the speaker embeddings forwarded to the model
        :param texts: the text prompts forwarded to the model
        :param hparams: hyperparameters to use, the defaults if None
        :return: the spectrograms and the alignments, as copies detached from the model
        """
        # Load the model and forward the inputs
        hparams = hparams or default_hparams
        tf.reset_default_graph()
        model = Tacotron2(checkpoint_fpath, hparams)
        try:
            specs, alignments = model.my_synthesize(embeddings, texts)

            # Detach the outputs (not doing so will cause the process to hang)
            specs, alignments = [spec.copy() for spec in specs], alignments.copy()
        finally:
            # Close cuda for this process, also when the synthesis itself failed
            model.session.close()
            numba.cuda.select_device(0)
            numba.cuda.close()

        return specs, alignments

    @staticmethod
    def load_preprocess_wav(fpath, hparams=None):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.

        :param fpath: path of the audio file, as a str or a Path
        :param hparams: hyperparameters to use (sample_rate, rescale, rescaling_max), the defaults
        if None
        :return: the waveform, rescaled to a peak of hparams.rescaling_max if hparams.rescale is set
        """
        hparams = hparams or default_hparams
        # str() keeps Path inputs working with audio backends that only accept string paths;
        # sr is passed by keyword because newer librosa releases made it keyword-only.
        wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]
        if hparams.rescale:
            # Empty or all-zero audio has no peak to normalize against: leave it untouched rather
            # than raising on an empty reduction or filling the waveform with NaNs.
            peak = np.abs(wav).max() if wav.size else 0.0
            if peak > 0:
                wav = wav / peak * hparams.rescaling_max
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray], hparams=None):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
        were fed to the synthesizer when training.

        :param fpath_or_wav: either the path of an audio file (str or Path) or an already loaded
        waveform
        :param hparams: hyperparameters to use, the defaults if None
        :return: the mel spectrogram as a float32 numpy array
        """
        hparams = hparams or default_hparams
        if isinstance(fpath_or_wav, (str, Path)):
            # Load with the same hparams that are used for the spectrogram below
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav, hparams)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel, hparams=None):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
        with the same parameters present in hparams.py.

        :param mel: the mel spectrogram to invert
        :param hparams: hyperparameters to use, the defaults if None
        :return: the output of audio.inv_melspectrogram for this spectrogram
        """
        hparams = hparams or default_hparams
        return audio.inv_melspectrogram(mel, hparams)
1+
from synthesizer.tacotron2 import Tacotron2
2+
from synthesizer.hparams import hparams as default_hparams
3+
# from multiprocess.pool import Pool # You're free to use either one
4+
from multiprocessing import Pool #
5+
from synthesizer.utils import audio
6+
from pathlib import Path
7+
from typing import Union, List
8+
import tensorflow as tf
9+
import numpy as np
10+
import numba.cuda
11+
import librosa
12+
13+
14+
class Synthesizer:
    """
    Inference wrapper around a Tacotron2 checkpoint. Synthesizes mel spectrograms from texts and
    speaker embeddings, and exposes static audio helpers (wav loading, mel spectrogram creation
    and Griffin-Lim inversion).
    """
    # Class-level defaults; __init__ overrides both on the instance.
    sample_rate = default_hparams.sample_rate
    hparams = default_hparams

    def __init__(self, checkpoints_dir: Union[str, Path], verbose=True, low_mem=False,
                 hparams=None):
        """
        Creates a synthesizer ready for inference. The actual model isn't loaded in memory until
        needed or until load() is called.

        :param checkpoints_dir: path to the directory containing the checkpoint file as well as the
        weight files (.data, .index and .meta files)
        :param verbose: if False, only tensorflow's output will be printed TODO: suppress them too
        :param low_mem: if True, the model will be loaded in a separate process and its resources
        will be released after each usage. Adds a large overhead, only recommended if your GPU
        memory is low (<= 2gb)
        :param hparams: hyperparameters to use instead of the defaults from synthesizer.hparams
        :raises Exception: if no checkpoint state can be found under checkpoints_dir
        """
        # Accept plain strings as well: the verbose branch below relies on Path attributes.
        checkpoints_dir = Path(checkpoints_dir)

        self.hparams = hparams or default_hparams
        self.sample_rate = self.hparams.sample_rate

        self.verbose = verbose
        self._low_mem = low_mem

        # Prepare the model
        self._model = None  # type: Tacotron2
        checkpoint_state = tf.train.get_checkpoint_state(str(checkpoints_dir))
        if checkpoint_state is None:
            raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
        self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
        if verbose:
            model_name = checkpoints_dir.parent.name.replace("logs-", "")
            # The training step is read from whatever follows the last '-' in the checkpoint path
            step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
            print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))

    def is_loaded(self):
        """
        Whether the model has been instantiated by load().
        """
        return self._model is not None

    def load(self):
        """
        Effectively loads the model given the checkpoint that was found in the constructor.

        :raises Exception: if the synthesizer was created in low memory mode
        """
        if self._low_mem:
            raise Exception("Cannot load the synthesizer permanently in low mem mode")
        tf.reset_default_graph()
        self._model = Tacotron2(self.checkpoint_fpath, self.hparams)

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments.
        """
        if not self._low_mem:
            # Usual inference mode: load the model on the first request and keep it loaded.
            if not self.is_loaded():
                self.load()
            specs, alignments = self._model.my_synthesize(embeddings, texts)
        else:
            # Low memory inference mode: load the model upon every request. The model has to be
            # loaded in a separate process to be able to release GPU memory (a simple workaround
            # to tensorflow's intricacies)
            # Custom hparams are forwarded so that the worker builds the same model as load()
            # would; None is sent for the defaults so nothing extra has to be pickled.
            hparams = None if self.hparams is default_hparams else self.hparams
            # The context manager terminates the worker once the (blocking) call has returned,
            # instead of leaving the pool alive until garbage collection.
            with Pool(1) as pool:
                specs, alignments = pool.starmap(
                    Synthesizer._one_shot_synthesize_spectrograms,
                    [(self.checkpoint_fpath, embeddings, texts, hparams)])[0]

        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts, hparams=None):
        """
        Loads a fresh model, synthesizes once and releases the session and the CUDA context.
        Used as the worker function of the low memory mode.

        :param checkpoint_fpath: path of the checkpoint to load
        :param embeddings: the speaker embeddings forwarded to the model
        :param texts: the text prompts forwarded to the model
        :param hparams: hyperparameters to use, the defaults if None
        :return: the spectrograms and the alignments, as copies detached from the model
        """
        # Load the model and forward the inputs
        hparams = hparams or default_hparams
        tf.reset_default_graph()
        model = Tacotron2(checkpoint_fpath, hparams)
        try:
            specs, alignments = model.my_synthesize(embeddings, texts)

            # Detach the outputs (not doing so will cause the process to hang)
            specs, alignments = [spec.copy() for spec in specs], alignments.copy()
        finally:
            # Close cuda for this process, also when the synthesis itself failed
            model.session.close()
            numba.cuda.select_device(0)
            numba.cuda.close()

        return specs, alignments

    @staticmethod
    def load_preprocess_wav(fpath, hparams=None):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.

        :param fpath: path of the audio file, as a str or a Path
        :param hparams: hyperparameters to use (sample_rate, rescale, rescaling_max), the defaults
        if None
        :return: the waveform, rescaled to a peak of hparams.rescaling_max if hparams.rescale is set
        """
        hparams = hparams or default_hparams
        # str() keeps Path inputs working with audio backends that only accept string paths;
        # sr is passed by keyword because newer librosa releases made it keyword-only.
        wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]
        if hparams.rescale:
            # Empty or all-zero audio has no peak to normalize against: leave it untouched rather
            # than raising on an empty reduction or filling the waveform with NaNs.
            peak = np.abs(wav).max() if wav.size else 0.0
            if peak > 0:
                wav = wav / peak * hparams.rescaling_max
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray], hparams=None):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
        were fed to the synthesizer when training.

        :param fpath_or_wav: either the path of an audio file (str or Path) or an already loaded
        waveform
        :param hparams: hyperparameters to use, the defaults if None
        :return: the mel spectrogram as a float32 numpy array
        """
        hparams = hparams or default_hparams
        if isinstance(fpath_or_wav, (str, Path)):
            # Load with the same hparams that are used for the spectrogram below
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav, hparams)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel, hparams=None):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
        with the same parameters present in hparams.py.

        :param mel: the mel spectrogram to invert
        :param hparams: hyperparameters to use, the defaults if None
        :return: the output of audio.inv_melspectrogram for this spectrogram
        """
        hparams = hparams or default_hparams
        return audio.inv_melspectrogram(mel, hparams)

0 commit comments

Comments
 (0)