Commit 17b0b87
Fix "File contains data in an unknown format"
The change is the same as in CorentinJ/Real-Time-Voice-Cloning#371: wrap wav_fpath in str() before passing it to librosa to prevent the error (see the sketch below).
trojblue authored Jul 30, 2020
1 parent bafcbe3 commit 17b0b87
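
For context, this is the failure mode being fixed: on some librosa/audioread versions, passing a pathlib.Path to librosa.load fails with "File contains data in an unknown format", while the same call succeeds once the path is cast to str. A minimal sketch (the file path and sample rate are hypothetical):

import librosa
from pathlib import Path

fpath = Path("wavs/example.wav")  # hypothetical input file

# Before the fix: may raise "File contains data in an unknown format"
# wav = librosa.load(fpath, sr=16000)[0]

# After the fix: cast the Path to str first
wav = librosa.load(str(fpath), sr=16000)[0]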
Showing 3 changed files with 567 additions and 0 deletions.
107 changes: 107 additions & 0 deletions audio.py
@@ -0,0 +1,107 @@
import math
import numpy as np
import librosa
import vocoder.hparams as hp
from scipy.signal import lfilter


def label_2_float(x, bits):
    # Map integer labels in [0, 2**bits - 1] to floats in [-1, 1]
    return 2 * x / (2**bits - 1.) - 1.


def float_2_label(x, bits):
    # Map floats in [-1, 1] to integer labels in [0, 2**bits - 1]
    assert abs(x).max() <= 1.0
    x = (x + 1.) * (2**bits - 1) / 2
    return x.clip(0, 2**bits - 1)


def load_wav(path):
    # str() avoids librosa's "File contains data in an unknown format" error
    # when path is a pathlib.Path (cf. CorentinJ/Real-Time-Voice-Cloning#371)
    return librosa.load(str(path), sr=hp.sample_rate)[0]


def save_wav(x, path):
    librosa.output.write_wav(path, x.astype(np.float32), sr=hp.sample_rate)


def split_signal(x):
    # Split a 16-bit signal into coarse (high) and fine (low) 8-bit halves
    unsigned = x + 2**15
    coarse = unsigned // 256
    fine = unsigned % 256
    return coarse, fine


def combine_signal(coarse, fine):
    return coarse * 256 + fine - 2**15


def encode_16bits(x):
    return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16)


mel_basis = None


def linear_to_mel(spectrogram):
    global mel_basis
    if mel_basis is None:
        mel_basis = build_mel_basis()
    return np.dot(mel_basis, spectrogram)


def build_mel_basis():
    return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)


def normalize(S):
    return np.clip((S - hp.min_level_db) / -hp.min_level_db, 0, 1)


def denormalize(S):
    return (np.clip(S, 0, 1) * -hp.min_level_db) + hp.min_level_db


def amp_to_db(x):
    return 20 * np.log10(np.maximum(1e-5, x))


def db_to_amp(x):
    return np.power(10.0, x * 0.05)


def spectrogram(y):
    D = stft(y)
    S = amp_to_db(np.abs(D)) - hp.ref_level_db
    return normalize(S)


def melspectrogram(y):
    D = stft(y)
    S = amp_to_db(linear_to_mel(np.abs(D)))
    return normalize(S)


def stft(y):
    return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length)


def pre_emphasis(x):
    return lfilter([1, -hp.preemphasis], [1], x)


def de_emphasis(x):
    return lfilter([1], [1, -hp.preemphasis], x)


def encode_mu_law(x, mu):
    # Mu-law companding: compress x in [-1, 1], then quantize to mu levels
    mu = mu - 1
    fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
    return np.floor((fx + 1) / 2 * mu + 0.5)


def decode_mu_law(y, mu, from_labels=True):
    # from_labels=True expects integer labels rather than floats in [-1, 1]
    if from_labels:
        y = label_2_float(y, math.log2(mu))
    mu = mu - 1
    x = np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1)
    return x
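
As a quick sanity check on the companding helpers above, a round-trip sketch; the 9-bit depth (mu = 512) is an assumption for illustration, the real value comes from vocoder.hparams:

import numpy as np

bits = 9                             # assumed bit depth, not read from hparams
mu = 2 ** bits

x = np.linspace(-1, 1, 5)            # toy waveform in [-1, 1]
labels = encode_mu_law(x, mu)        # integer labels in [0, mu - 1]
x_hat = decode_mu_law(labels, mu, from_labels=True)

print(np.max(np.abs(x - x_hat)))     # small reconstruction error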

143 changes: 143 additions & 0 deletions inference.py
@@ -0,0 +1,143 @@
from synthesizer.tacotron2 import Tacotron2
from synthesizer.hparams import hparams as default_hparams
# from multiprocess.pool import Pool  # You're free to use either one
from multiprocessing import Pool
from synthesizer.utils import audio
from pathlib import Path
from typing import Union, List
import tensorflow as tf
import numpy as np
import numba.cuda
import librosa


class Synthesizer:
    sample_rate = default_hparams.sample_rate
    hparams = default_hparams

    def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False, hparams=None):
        """
        Creates a synthesizer ready for inference. The actual model isn't loaded in memory
        until needed or until load() is called.
        :param checkpoints_dir: path to the directory containing the checkpoint file as well
        as the weight files (.data, .index and .meta files)
        :param verbose: if False, only tensorflow's output will be printed (TODO: suppress
        that too)
        :param low_mem: if True, the model will be loaded in a separate process and its
        resources will be released after each usage. Adds a large overhead; only recommended
        if your GPU memory is low (<= 2 GB).
        """
        self.hparams = hparams or default_hparams
        self.sample_rate = self.hparams.sample_rate

        self.verbose = verbose
        self._low_mem = low_mem

        # Prepare the model
        self._model = None  # type: Tacotron2
        checkpoint_state = tf.train.get_checkpoint_state(str(checkpoints_dir))
        if checkpoint_state is None:
            raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
        self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
        if verbose:
            model_name = checkpoints_dir.parent.name.replace("logs-", "")
            step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
            print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))

    def is_loaded(self):
        """
        Whether the model is loaded in GPU memory.
        """
        return self._model is not None

    def load(self):
        """
        Effectively loads the model to GPU memory given the weights file that was passed in
        the constructor.
        """
        if self._low_mem:
            raise Exception("Cannot load the synthesizer permanently in low mem mode")
        tf.reset_default_graph()
        self._model = Tacotron2(self.checkpoint_fpath, self.hparams)

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.
        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N mel spectrograms as numpy arrays of shape (80, Mi), where Mi is
        the sequence length of spectrogram i, and possibly the alignments.
        """
        if not self._low_mem:
            # Usual inference mode: load the model on the first request and keep it loaded.
            if not self.is_loaded():
                self.load()
            specs, alignments = self._model.my_synthesize(embeddings, texts)
        else:
            # Low memory inference mode: load the model upon every request. The model has to
            # be loaded in a separate process to be able to release GPU memory (a simple
            # workaround to tensorflow's intricacies).
            specs, alignments = Pool(1).starmap(Synthesizer._one_shot_synthesize_spectrograms,
                                                [(self.checkpoint_fpath, embeddings, texts)])[0]

        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts, hparams=None):
        # Load the model and forward the inputs
        hparams = hparams or default_hparams
        tf.reset_default_graph()
        model = Tacotron2(checkpoint_fpath, hparams)
        specs, alignments = model.my_synthesize(embeddings, texts)

        # Detach the outputs (not doing so will cause the process to hang)
        specs, alignments = [spec.copy() for spec in specs], alignments.copy()

        # Close cuda for this process
        model.session.close()
        numba.cuda.select_device(0)
        numba.cuda.close()

        return specs, alignments

    @staticmethod
    def load_preprocess_wav(fpath, hparams=None):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were
        used to train the synthesizer.
        """
        hparams = hparams or default_hparams
        # str() avoids "File contains data in an unknown format" when fpath is a Path
        wav = librosa.load(str(fpath), hparams.sample_rate)[0]
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray], hparams=None):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel
        spectrograms that were fed to the synthesizer when training.
        """
        hparams = hparams or default_hparams
        if isinstance(fpath_or_wav, (str, Path)):
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel, hparams=None):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have
        been built with the same parameters present in hparams.py.
        """
        hparams = hparams or default_hparams
        return audio.inv_melspectrogram(mel, hparams)
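
To tie the pieces together, a hedged usage sketch of the class above; the checkpoint directory, text, and embedding are placeholders (in practice a real speaker embedding of shape (N, 256) would come from the encoder, as the docstring notes):

from pathlib import Path
import numpy as np

synth = Synthesizer(Path("synthesizer/saved_models/checkpoints"))  # placeholder path

texts = ["Hello world."]
embeds = np.random.rand(1, 256).astype(np.float32)  # stand-in for a real embedding

specs = synth.synthesize_spectrograms(texts, embeds)  # list of (80, Mi) arrays
wav = Synthesizer.griffin_lim(specs[0])               # rough waveform via Griffin-Lim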
