Merge pull request #40 from stealthinu/v1.5onnx化固定長
V1.5onnx化固定長 (v1.5 ONNX conversion, fixed length)
isletennos authored Mar 28, 2023
2 parents 20a5d6a + 35e74a4 commit 068c0df
Showing 2 changed files with 78 additions and 54 deletions.
84 changes: 52 additions & 32 deletions python/mmvc_client.py
@@ -288,11 +288,17 @@ def launch_model(self):
requires_grad_flow = self.hps.requires_grad.flow,
requires_grad_text_enc = self.hps.requires_grad.text_enc,
requires_grad_dec = self.hps.requires_grad.dec,
requires_grad_emb_g = self.hps.requires_grad.emb_g
requires_grad_emb_g = self.hps.requires_grad.emb_g,
sample_rate = self.hps.data.sampling_rate,
hop_size = self.hps.data.hop_length,
sine_amp = self.hps.data.sine_amp,
noise_amp = self.hps.data.noise_amp,
signal_types = self.hps.data.signal_types,
dense_factors = self.hps.data.dense_factors,
upsample_scales = self.hps.model.upsample_rates,
)
_ = net_g.eval()
# provisional: 872000
_ = load_checkpoint(Hyperparameters.MODEL_PATH, net_g, None)

return net_g
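
For orientation, the new constructor arguments above are read from the client's hyperparameter file. A minimal sketch of the corresponding fields follows; the field names mirror the hps.data / hps.model attributes used in launch_model, while the values are only the defaults declared in models.py below, not necessarily a real config.

# Illustrative only: values are the models.py defaults, the real ones come from the user's config.
example_hps = {
    "data": {
        "sampling_rate": 24000,           # -> sample_rate
        "hop_length": 128,                # -> hop_size
        "sine_amp": 0.1,                  # -> sine_amp
        "noise_amp": 0.003,               # -> noise_amp
        "signal_types": ["sine"],         # -> signal_types
        "dense_factors": [0.5, 1, 4, 8],  # -> dense_factors
    },
    "model": {
        "upsample_rates": [8, 4, 2, 2],   # -> upsample_scales
    },
}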

# estimate cf0 (continuous f0) from f0
@@ -324,6 +330,7 @@ def convert_continuos_f0(self, f0, f0_size):
return f(np.arange(0, f0_size))

def audio_trans(self, tdbm, input, net_g, noise_data, target_id, f0_scale, dispose_stft_specs, dispose_conv1d_specs, ort_session=None):
gpu_id = Hyperparameters.GPU_ID
hop_length = Hyperparameters.HOP_LENGTH
dispose_conv1d_length = dispose_conv1d_specs * hop_length

@@ -358,38 +365,42 @@ def audio_trans(self, tdbm, input, net_g, noise_data, target_id, f0_scale, dispo
data = TextAudioSpeakerCollate(
sample_rate = Hyperparameters.SAMPLE_RATE,
hop_size = Hyperparameters.HOP_LENGTH,
dense_factors = self.hps.data.dense_factors,
upsample_scales = self.hps.model.upsample_rates,
f0_factor = f0_scale
)([(spec, sid, f0)])

spec, spec_lengths, sid_src, f0 = data
sid_target = torch.LongTensor([target_id]) # the speaker ID is the JVS speaker number modulo 100
if Hyperparameters.USE_ONNX:
spec, spec_lengths, sid_src, f0 = data
sid_target = torch.LongTensor([target_id]) # the speaker ID is the JVS speaker number modulo 100
sin, d = net_g.make_sin_d(f0)
(d0, d1, d2, d3) = d
if spec.size()[2] >= 8:
audio = ort_session.run(
["audio"],
{
"specs": spec.numpy(),
"lengths": spec_lengths.numpy(),
"f0": f0.numpy(),
"sin": sin.numpy(),
"d0": d0.numpy(),
"d1": d1.numpy(),
"d2": d2.numpy(),
"d3": d3.numpy(),
"sid_src": sid_src.numpy(),
"sid_tgt": sid_target.numpy()
})[0][0,0]
else:
audio = np.array([0.0]) # dummy
else:
if Hyperparameters.GPU_ID >= 0:
#spec, spec_lengths, sid_src, sin, d = [x.cuda(Hyperparameters.GPU_ID) for x in data]
spec, spec_lengths, sid_src, f0 = data
spec = spec.cuda(Hyperparameters.GPU_ID)
spec_lengths = spec_lengths.cuda(Hyperparameters.GPU_ID)
sid_src = sid_src.cuda(Hyperparameters.GPU_ID)
sid_target = torch.LongTensor([target_id]).cuda(Hyperparameters.GPU_ID) # the speaker ID is the JVS speaker number modulo 100
f0 = f0.cuda(0)
audio = net_g.cuda(Hyperparameters.GPU_ID).voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0][0,0].data.cpu().float().numpy()
if gpu_id >= 0:
#spec, spec_lengths, sid_src, sin, d = [x.cuda(gpu_id) for x in data]
spec = spec.cuda(gpu_id)
spec_lengths = spec_lengths.cuda(gpu_id)
sid_src = sid_src.cuda(gpu_id)
sid_target = sid_target.cuda(gpu_id) # the speaker ID is the JVS speaker number modulo 100
f0 = f0.cuda(gpu_id)
audio = net_g.cuda(gpu_id).voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0,0].data.cpu().float().numpy()
else:
spec, spec_lengths, sid_src, f0 = data
sid_target = torch.LongTensor([target_id]) # the speaker ID is the JVS speaker number modulo 100
audio = net_g.voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0][0,0].data.cpu().float().numpy()
audio = net_g.voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0,0].data.cpu().float().numpy()

if dispose_conv1d_specs != 0:
# trim the part of the output wav affected by conv1d padding
@@ -427,19 +438,21 @@ def overlap_merge(self, now_wav, prev_wav, overlap_length):
def vc_run(self):
audio = pyaudio.PyAudio()
print("モデルを読み込んでいます。少々お待ちください。")
net_g = None
net_g = self.launch_model()
ort_session = None
if Hyperparameters.USE_ONNX :
# settings for running on DirectML
ort_options = ort.SessionOptions()
ort_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
ort_options.enable_mem_pattern = False
#ort_options.enable_profiling = True
ort_session = ort.InferenceSession(
Hyperparameters.MODEL_PATH,
sess_options=ort_options,
providers=Hyperparameters.ONNX_PROVIDERS)
else:
net_g = self.launch_model()
_ = load_checkpoint(Hyperparameters.MODEL_PATH, net_g, None)

print("モデルの読み込みが完了しました。音声の入出力の準備を行います。少々お待ちください。")
tdbm = Transform_Data_By_Model()

@@ -520,26 +533,31 @@ def vc_run(self):
voice_selector = VoiceSelector()
voice_selector.open_window()

prev_wav_tail = bytes(0)
in_wav = prev_wav_tail + audio_input_stream.read(delay_frames, exception_on_overflow=False)
trans_wav = self.audio_trans(tdbm, in_wav, net_g, noise_data, target_id, f0_scale, 0, 0, ort_session=ort_session) # to reduce latency, the padding workaround is skipped on the first pass only
overlapped_wav = trans_wav
prev_trans_wav = trans_wav
if dispose_length + overlap_length != 0:
prev_wav_tail = in_wav[-((dispose_length + overlap_length) * wav_bytes):] # keep the tail data as the head data for the next pass
if with_bgm:
back_in_raw = back_audio_input_stream.read(delay_frames, exception_on_overflow = False) # read the background BGM
# in_wav: delay_frames * wav_bytes = 4096 * 2 = 8192
# prev_wav_tail: (dispose_length + overlap_length) * wav_bytes = (1536 + 128) * 2 = 3328
# prev_trans_wav: (delay_frames + overlap_length) * wav_bytes = (4096 + 128) * 2 = 8448
prev_wav_tail = bytes((dispose_length + overlap_length) * wav_bytes)
prev_trans_wav = bytes((delay_frames + overlap_length) * wav_bytes)
#prev_wav_tail = bytes(0)
#in_wav = prev_wav_tail + audio_input_stream.read(delay_frames, exception_on_overflow=False)
#trans_wav = self.audio_trans(tdbm, in_wav, net_g, noise_data, target_id, 0, 0, ort_session=ort_session) # to reduce latency, the padding workaround is skipped on the first pass only
#overlapped_wav = trans_wav
#prev_trans_wav = trans_wav
#if dispose_length + overlap_length != 0:
# prev_wav_tail = in_wav[-((dispose_length + overlap_length) * wav_bytes):] # keep the tail data as the head data for the next pass
#if with_bgm:
# back_in_raw = back_audio_input_stream.read(delay_frames, exception_on_overflow = False) # read the background BGM
while True:
audio_output_stream.write(overlapped_wav)
in_wav = prev_wav_tail + audio_input_stream.read(delay_frames, exception_on_overflow=False)
trans_wav = self.audio_trans(tdbm, in_wav, net_g, noise_data, target_id, f0_scale, dispose_stft_specs, dispose_conv1d_specs, ort_session=ort_session)
overlapped_wav = self.overlap_merge(trans_wav, prev_trans_wav, overlap_length)
audio_output_stream.write(overlapped_wav)
prev_trans_wav = trans_wav
if dispose_length + overlap_length != 0:
prev_wav_tail = in_wav[-((dispose_length + overlap_length) * wav_bytes):] # keep this pass's discarded tail as the head data for the next pass
if with_bgm:
back_audio_output_stream.write(back_in_raw)
back_in_raw = back_audio_input_stream.read(delay_frames, exception_on_overflow=False) # read the background BGM
back_audio_output_stream.write(back_in_raw)

if with_voice_selector and voice_selector_flag:
target_id = voice_selector.voice_select_id
@@ -559,6 +577,8 @@
back_audio_output_stream.stop_stream()
back_audio_output_stream.close()
audio.terminate()
#prof_file = ort_session.end_profiling()
#print(prof_file)
print("Stop Streaming")

if with_voice_selector and voice_selector_flag:
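Note that overlap_merge itself is not touched by this commit (only its signature appears in a hunk header above), so the following is only a guess at the kind of crossfade the main loop relies on: a minimal sketch assuming 16-bit mono PCM byte strings (wav_bytes = 2 in the comments above) and a linear fade over overlap_length samples; the real MMVC implementation may differ.

import numpy as np

def overlap_merge_sketch(now_wav: bytes, prev_wav: bytes, overlap_length: int) -> bytes:
    # Hypothetical helper, not the MMVC implementation: crossfade the head of the
    # current chunk with the tail of the previous chunk to hide block boundaries.
    if overlap_length == 0:
        return now_wav
    now = np.frombuffer(now_wav, dtype=np.int16).astype(np.float32)
    prev = np.frombuffer(prev_wav, dtype=np.int16).astype(np.float32)
    fade_in = np.linspace(0.0, 1.0, overlap_length, dtype=np.float32)
    head = now[:overlap_length] * fade_in + prev[-overlap_length:] * (1.0 - fade_in)
    merged = np.concatenate([head, now[overlap_length:]])
    return merged.astype(np.int16).tobytes()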
48 changes: 26 additions & 22 deletions python/models.py
@@ -298,7 +298,15 @@ def __init__(self,
requires_grad_flow=True,
requires_grad_text_enc=True,
requires_grad_dec=True,
requires_grad_emb_g=True,):
requires_grad_emb_g=True,
sample_rate=24000,
hop_size=128,
sine_amp=0.1,
noise_amp=0.003,
signal_types=["sine"],
dense_factors=[0.5, 1, 4, 8],
upsample_scales=[8, 4, 2, 2],
):

super().__init__()
self.spec_channels = spec_channels
@@ -316,6 +324,13 @@ def __init__(self,
self.requires_grad_text_enc = requires_grad_text_enc
self.requires_grad_dec = requires_grad_dec
self.requires_grad_emb_g = requires_grad_emb_g
self.sample_rate = sample_rate
self.hop_size = hop_size
self.sine_amp = sine_amp
self.noise_amp = noise_amp
self.signal_types = signal_types
self.dense_factors = dense_factors
self.upsample_scales = upsample_scales

self.enc_q = PosteriorEncoder(
spec_channels,
@@ -348,10 +363,10 @@ def __init__(self,
gin_channels=gin_channels,
requires_grad=requires_grad_flow)
self.signal_generator = SignalGenerator(
sample_rate=24000,
hop_size=128,
noise_amp=0.0,
signal_types=["sine"]
sample_rate=sample_rate,
hop_size=hop_size,
noise_amp=noise_amp,
signal_types=signal_types
)

if n_speakers > 1:
@@ -396,30 +411,19 @@ def forward(self, x, x_lengths, y, y_lengths, f0, slice_id, sid=None, target_ids

return (o, tgt_o), slice_id, x_mask, y_mask, ((z, z_p, m_p), logs_p, m_q, logs_q)

def make_sin_d(self,
f0,
f0_scale=1.0,
sample_rate=24000,
dense_factors=[0.5, 1, 4, 8],
upsample_scales=[8, 4, 2, 2]
):
def make_sin_d(self, f0):
# build sin and d from f0
# f0 : [b, 1, t]
# sin : [b, 1, t]
# d : [4][b, 1, t]
#dense_factors = torch.tensor(dense_factors).to(device)
#upsample_scales = torch.tensor(upsample_scales).to(device)
#prod_upsample_scales = torch.cumprod(upsample_scales, dim=0)
prod_upsample_scales = np.cumprod(upsample_scales)
prod_upsample_scales = np.cumprod(self.upsample_scales)
dfs_batch = []
for df, us in zip(dense_factors, prod_upsample_scales):
dilated_tensor = dilated_factor(f0, sample_rate, df)
for df, us in zip(self.dense_factors, prod_upsample_scales):
dilated_tensor = dilated_factor(f0, self.sample_rate, df)
#result += [torch.repeat_interleave(dilated_tensor, us, dim=1)]
result = [torch.stack([dilated_tensor for _ in range(us)], -1).reshape(dilated_tensor.shape[0], -1)]
#dfs_batch += [result]
dfs_batch.append(torch.cat(result, dim=0).unsqueeze(1))

in_batch = self.signal_generator(f0 * f0_scale)
in_batch = self.signal_generator(f0)

return in_batch, dfs_batch
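
As a side note on the reshaping above, here is a tiny self-contained check with dummy data (it assumes dilated_factor yields a [batch, frames] tensor, which is only an illustrative shape). It shows that np.cumprod turns the default upsample_scales = [8, 4, 2, 2] into cumulative factors [8, 32, 64, 128], and that the stack/reshape used here produces the same result as the commented-out torch.repeat_interleave, presumably kept this way to stay ONNX-friendly.

import numpy as np
import torch

# Illustrative check only (dummy data, assumed [batch, frames] shape).
upsample_scales = [8, 4, 2, 2]
prod_upsample_scales = np.cumprod(upsample_scales)      # array([  8,  32,  64, 128])

x = torch.arange(6.0).reshape(1, 6)                     # stand-in for a dilated_factor output
us = int(prod_upsample_scales[0])                       # 8
stacked = torch.stack([x for _ in range(us)], -1).reshape(x.shape[0], -1)   # shape [1, 48]
assert torch.equal(stacked, torch.repeat_interleave(x, us, dim=1))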

@@ -445,7 +449,7 @@ def voice_conversion(self, y, y_lengths, f0, sid_src, sid_tgt):
z_p = self.flow(z, y_mask, g=g_src)
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
o_hat = self.dec(sin, z_hat * y_mask, d, sid=g_tgt)
return o_hat
return o_hat[0]

def voice_ra_pa_db(self, y, y_lengths, sid_src, sid_tgt):
assert self.n_speakers > 0, "n_speakers have to be larger than 0."
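The export script itself is not part of this commit. The purely hypothetical sketch below only illustrates why make_sin_d now takes just f0 and why sin and d0..d3 appear as explicit session inputs in mmvc_client.py: with the signal parameters fixed inside the model, the sine excitation and dilation factors can be computed on the client and fed to the exported graph as plain tensors. The wrapper class and the voice_conversion_onnx method name are assumptions for illustration, not MMVC API.

import torch

class OnnxVC(torch.nn.Module):
    # Hypothetical wrapper whose forward() mirrors the inputs that
    # mmvc_client.py passes to ort_session.run(); the real export code may differ.
    def __init__(self, net_g):
        super().__init__()
        self.net_g = net_g

    def forward(self, specs, lengths, f0, sin, d0, d1, d2, d3, sid_src, sid_tgt):
        # Assumed ONNX-oriented entry point that accepts the externally
        # generated excitation (sin) and dilation factors (d0..d3).
        return self.net_g.voice_conversion_onnx(
            specs, lengths, f0, sin, (d0, d1, d2, d3), sid_src, sid_tgt)

# torch.onnx.export(OnnxVC(net_g), dummy_inputs, "mmvc_v15.onnx",
#                   input_names=["specs", "lengths", "f0", "sin",
#                                "d0", "d1", "d2", "d3", "sid_src", "sid_tgt"],
#                   output_names=["audio"], opset_version=17)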
