forked from xingmegshuo/zhrtvc
修复"File contains data in an unknown format"
修改的部分和这个一样, CorentinJ/Real-Time-Voice-Cloning#371, 在eav_fpath前面加上str防止出错
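For context, a minimal sketch of the failure this commit works around, assuming (per the linked issue) that older librosa/audioread releases reject pathlib.Path inputs; the file path and sample rate below are illustrative placeholders, not values from the diff:

    from pathlib import Path
    import librosa

    fpath = Path("samples/example.wav")  # hypothetical file, for illustration only

    # On affected versions, passing the Path object directly can raise
    # "File contains data in an unknown format" from the audio backend:
    # wav, sr = librosa.load(fpath, sr=16000)

    # The fix applied throughout this commit: convert the path to str first.
    wav, sr = librosa.load(str(fpath), sr=16000)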
Showing 3 changed files with 567 additions and 0 deletions.
@@ -0,0 +1,107 @@
import math
import numpy as np
import librosa
import vocoder.hparams as hp
from scipy.signal import lfilter


def label_2_float(x, bits):
    # Map integer labels in [0, 2**bits - 1] to floats in [-1, 1]
    return 2 * x / (2**bits - 1.) - 1.


def float_2_label(x, bits):
    # Inverse of label_2_float: floats in [-1, 1] to labels in [0, 2**bits - 1]
    assert abs(x).max() <= 1.0
    x = (x + 1.) * (2**bits - 1) / 2
    return x.clip(0, 2**bits - 1)


def load_wav(path):
    # str() guards against pathlib.Path inputs that older librosa/audioread
    # reject with "File contains data in an unknown format"
    return librosa.load(str(path), sr=hp.sample_rate)[0]


def save_wav(x, path):
    librosa.output.write_wav(path, x.astype(np.float32), sr=hp.sample_rate)


def split_signal(x):
    # Split a 16-bit signal into coarse (high) and fine (low) 8-bit halves
    unsigned = x + 2**15
    coarse = unsigned // 256
    fine = unsigned % 256
    return coarse, fine


def combine_signal(coarse, fine):
    return coarse * 256 + fine - 2**15


def encode_16bits(x):
    # Float waveform in [-1, 1] to int16 samples
    return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)


mel_basis = None


def linear_to_mel(spectrogram):
    # Lazily build and cache the mel filterbank, then project the spectrogram
    global mel_basis
    if mel_basis is None:
        mel_basis = build_mel_basis()
    return np.dot(mel_basis, spectrogram)


def build_mel_basis():
    return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)


def normalize(S):
    # Scale a dB spectrogram to [0, 1] relative to min_level_db
    return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1)


def denormalize(S):
    return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db


def amp_to_db(x):
    return 20 * np.log10(np.maximum(1e-5, x))


def db_to_amp(x):
    return np.power(10.0, x * 0.05)


def spectrogram(y):
    # Normalized linear-frequency spectrogram in dB
    D = stft(y)
    S = amp_to_db(np.abs(D)) - hp.ref_level_db
    return normalize(S)


def melspectrogram(y):
    # Normalized mel-frequency spectrogram in dB
    D = stft(y)
    S = amp_to_db(linear_to_mel(np.abs(D)))
    return normalize(S)


def stft(y):
    return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length)


def pre_emphasis(x):
    # First-order high-pass filter: y[n] = x[n] - preemphasis * x[n-1]
    return lfilter([1, -hp.preemphasis], [1], x)


def de_emphasis(x):
    # Inverse of pre_emphasis
    return lfilter([1], [1, -hp.preemphasis], x)


def encode_mu_law(x, mu):
    # Mu-law companding: floats in [-1, 1] to integer labels in [0, mu - 1]
    mu = mu - 1
    fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
    return np.floor((fx + 1) / 2 * mu + 0.5)


def decode_mu_law(y, mu, from_labels=True):
    # Mu-law expansion; from_labels=True first maps labels back to [-1, 1]
    if from_labels:
        y = label_2_float(y, math.log2(mu))
    mu = mu - 1
    x = np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1)
    return x
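As a usage sketch of the mu-law helpers above (the import path and bit depth are assumptions, since the diff does not name the file; the real bit depth comes from vocoder.hparams):

    import numpy as np
    from vocoder.audio import encode_mu_law, decode_mu_law  # assumed module path

    bits = 9          # illustrative bit depth
    mu = 2**bits      # number of quantization levels

    x = np.linspace(-1.0, 1.0, 101)           # waveform samples in [-1, 1]
    labels = encode_mu_law(x, mu)             # integer labels in [0, mu - 1]
    x_hat = decode_mu_law(labels, mu, from_labels=True)

    print(np.abs(x - x_hat).max())            # small companding round-trip error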
@@ -0,0 +1,143 @@
from synthesizer.tacotron2 import Tacotron2
from synthesizer.hparams import hparams as default_hparams
# from multiprocess.pool import Pool  # You're free to use either one
from multiprocessing import Pool
from synthesizer.utils import audio
from pathlib import Path
from typing import Union, List
import tensorflow as tf
import numpy as np
import numba.cuda
import librosa


class Synthesizer:
    sample_rate = default_hparams.sample_rate
    hparams = default_hparams

    def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False, hparams=None):
        """
        Creates a synthesizer ready for inference. The actual model isn't loaded in memory
        until needed or until load() is called.
        :param checkpoints_dir: path to the directory containing the checkpoint file as well
        as the weight files (.data, .index and .meta files)
        :param verbose: if False, only tensorflow's output will be printed  TODO: suppress it too
        :param low_mem: if True, the model will be loaded in a separate process and its
        resources will be released after each usage. Adds a large overhead; only recommended
        if your GPU memory is low (<= 2 GB)
        """
        self.hparams = hparams or default_hparams
        self.sample_rate = self.hparams.sample_rate

        self.verbose = verbose
        self._low_mem = low_mem

        # Prepare the model
        self._model = None  # type: Tacotron2
        checkpoint_state = tf.train.get_checkpoint_state(str(checkpoints_dir))
        if checkpoint_state is None:
            raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
        self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
        if verbose:
            model_name = checkpoints_dir.parent.name.replace("logs-", "")
            step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
            print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))

    def is_loaded(self):
        """
        Whether the model is loaded in GPU memory.
        """
        return self._model is not None

    def load(self):
        """
        Effectively loads the model to GPU memory given the weights file that was passed in
        the constructor.
        """
        if self._low_mem:
            raise Exception("Cannot load the synthesizer permanently in low mem mode")
        tf.reset_default_graph()
        self._model = Tacotron2(self.checkpoint_fpath, self.hparams)

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.
        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N mel spectrograms as numpy arrays of shape (80, Mi), where Mi is
        the sequence length of spectrogram i, and possibly the alignments.
        """
        if not self._low_mem:
            # Usual inference mode: load the model on the first request and keep it loaded.
            if not self.is_loaded():
                self.load()
            specs, alignments = self._model.my_synthesize(embeddings, texts)
        else:
            # Low memory inference mode: load the model upon every request. The model has to
            # be loaded in a separate process to be able to release GPU memory (a simple
            # workaround to tensorflow's intricacies)
            specs, alignments = Pool(1).starmap(Synthesizer._one_shot_synthesize_spectrograms,
                                                [(self.checkpoint_fpath, embeddings, texts)])[0]

        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts, hparams=None):
        # Load the model and forward the inputs
        hparams = hparams or default_hparams
        tf.reset_default_graph()
        model = Tacotron2(checkpoint_fpath, hparams)
        specs, alignments = model.my_synthesize(embeddings, texts)

        # Detach the outputs (not doing so will cause the process to hang)
        specs, alignments = [spec.copy() for spec in specs], alignments.copy()

        # Close cuda for this process
        model.session.close()
        numba.cuda.select_device(0)
        numba.cuda.close()

        return specs, alignments

    @staticmethod
    def load_preprocess_wav(fpath, hparams=None):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were
        used to train the synthesizer.
        """
        hparams = hparams or default_hparams
        # str() is this commit's fix: older librosa/audioread can fail on a pathlib.Path
        # with "File contains data in an unknown format"
        # (see CorentinJ/Real-Time-Voice-Cloning#371)
        wav = librosa.load(str(fpath), hparams.sample_rate)[0]
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray], hparams=None):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel
        spectrograms that were fed to the synthesizer when training.
        """
        hparams = hparams or default_hparams
        if isinstance(fpath_or_wav, (str, Path)):
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel, hparams=None):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have
        been built with the same parameters present in hparams.py.
        """
        hparams = hparams or default_hparams
        return audio.inv_melspectrogram(mel, hparams)
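To show how the class above is typically driven, a hedged end-to-end sketch; the checkpoint directory, module path, text, and zero embedding are placeholders, and in real use the speaker embedding would come from the encoder:

    from pathlib import Path
    import numpy as np
    from synthesizer.inference import Synthesizer  # assumed module path

    ckpt_dir = Path("synthesizer/saved_models/logs-pretrained/taco_pretrained")  # placeholder
    synth = Synthesizer(ckpt_dir, verbose=True, low_mem=False)

    texts = ["hello world"]
    embeddings = [np.zeros(256, dtype=np.float32)]  # shape (N, 256), per the docstring

    specs = synth.synthesize_spectrograms(texts, embeddings)
    mel = specs[0]                       # (80, M) mel spectrogram
    wav = Synthesizer.griffin_lim(mel)   # rough waveform via Griffin-Lim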