Skip to content

Commit

Permalink
Update NSF-HIFIGAN Enhancer
Browse files Browse the repository at this point in the history
  • Loading branch information
ylzz1997 committed Apr 8, 2023
1 parent c160066 commit b624394
Show file tree
Hide file tree
Showing 9 changed files with 819 additions and 28 deletions.
24 changes: 20 additions & 4 deletions inference/infer_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ class F0FilterException(Exception):
class Svc(object):
def __init__(self, net_g_path, config_path,
device=None,
cluster_model_path="logs/44k/kmeans_10000.pt"):
cluster_model_path="logs/44k/kmeans_10000.pt",
nsf_hifigan_enhance = False
):
self.net_g_path = net_g_path
if device is None:
self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Expand All @@ -125,11 +127,15 @@ def __init__(self, net_g_path, config_path,
self.target_sample = self.hps_ms.data.sampling_rate
self.hop_size = self.hps_ms.data.hop_length
self.spk2id = self.hps_ms.spk
self.nsf_hifigan_enhance = nsf_hifigan_enhance
# 加载hubert
self.hubert_model = utils.get_hubert_model().to(self.dev)
self.load_model()
if os.path.exists(cluster_model_path):
self.cluster_model = cluster.get_cluster_model(cluster_model_path)
if self.nsf_hifigan_enhance:
from modules.enhancer import Enhancer
self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)

def load_model(self):
# 获取模型配置
Expand Down Expand Up @@ -185,7 +191,8 @@ def infer(self, speaker, tran, raw_path,
auto_predict_f0=False,
noice_scale=0.4,
f0_filter=False,
F0_mean_pooling=False
F0_mean_pooling=False,
enhancer_adaptive_key = 0
):

speaker_id = self.spk2id.__dict__.get(speaker)
Expand All @@ -199,6 +206,13 @@ def infer(self, speaker, tran, raw_path,
with torch.no_grad():
start = time.time()
audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0,0].data.float()
if self.nsf_hifigan_enhance:
audio, _ = self.enhancer.enhance(
audio[None,:],
self.target_sample,
f0[:,:,None],
self.hps_ms.data.hop_length,
adaptive_key = enhancer_adaptive_key)
use_time = time.time() - start
print("vits use time:{}".format(use_time))
return audio, audio.shape[-1]
Expand All @@ -219,7 +233,8 @@ def slice_inference(self,
clip_seconds=0,
lg_num=0,
lgr_num =0.75,
F0_mean_pooling = False
F0_mean_pooling = False,
enhancer_adaptive_key = 0
):
wav_path = raw_audio_path
chunks = slicer.cut(wav_path, db_thresh=slice_db)
Expand Down Expand Up @@ -258,7 +273,8 @@ def slice_inference(self,
cluster_infer_ratio=cluster_infer_ratio,
auto_predict_f0=auto_predict_f0,
noice_scale=noice_scale,
F0_mean_pooling = F0_mean_pooling
F0_mean_pooling = F0_mean_pooling,
enhancer_adaptive_key = enhancer_adaptive_key
)
_audio = out_audio.cpu().numpy()
pad_len = int(self.target_sample * pad_seconds)
Expand Down
19 changes: 13 additions & 6 deletions inference_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,19 @@ def main():
parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='聚类方案占比,范围0-1,若没有训练聚类模型则默认0即可')
parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='两段音频切片的交叉淡入长度,如果强制切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,单位为秒')
parser.add_argument('-fmp', '--f0_mean_pooling', type=bool, default=False, help='是否对F0使用均值滤波器(池化),对部分哑音有改善。注意,启动该选项会导致推理速度下降,默认关闭')

parser.add_argument('-eh', '--enhance', type=bool, default=False, help='是否使用NSF_HIFIGAN增强器,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭')

# 不用动的部分
parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='默认-40,嘈杂的音频可以-30,干声保留呼吸可以-50')
parser.add_argument('-d', '--device', type=str, default=None, help='推理设备,None则为自动选择cpu和gpu')
parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='噪音级别,会影响咬字和音质,较为玄学')
parser.add_argument('-p', '--pad_seconds', type=float, default=0.5, help='推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现')
parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='音频输出格式')
parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭')

parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0, help='使增强器适应更高的音域(单位为半音数)|默认为0')

args = parser.parse_args()

svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path)
infer_tool.mkdir(["raw", "results"])
clean_names = args.clean_names
trans = args.trans
spk_list = args.spk_list
Expand All @@ -62,6 +62,11 @@ def main():
lg = args.linear_gradient
lgr = args.linear_gradient_retain
F0_mean_pooling = args.f0_mean_pooling
enhance = args.enhance
enhancer_adaptive_key = args.enhancer_adaptive_key

svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance)
infer_tool.mkdir(["raw", "results"])

infer_tool.fill_a_to_b(trans, clean_names)
for clean_name, tran in zip(clean_names, trans):
Expand Down Expand Up @@ -107,7 +112,8 @@ def main():
cluster_infer_ratio=cluster_infer_ratio,
auto_predict_f0=auto_predict_f0,
noice_scale=noice_scale,
F0_mean_pooling = F0_mean_pooling
F0_mean_pooling = F0_mean_pooling,
enhancer_adaptive_key = enhancer_adaptive_key
)
_audio = out_audio.cpu().numpy()
pad_len = int(svc_model.target_sample * pad_seconds)
Expand All @@ -125,6 +131,7 @@ def main():
cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}.{wav_format}'
soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)

svc_model.clear_empty()

if __name__ == '__main__':
main()
105 changes: 105 additions & 0 deletions modules/enhancer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import numpy as np
import torch
import torch.nn.functional as F
from vdecoder.nsf_hifigan.nvSTFT import STFT
from vdecoder.nsf_hifigan.models import load_model
from torchaudio.transforms import Resample

class Enhancer:
    """Post-process generated audio by re-synthesising it with a vocoder.

    Only the ``'nsf-hifigan'`` backend is supported; any other
    ``enhancer_type`` raises ``ValueError``.
    """

    def __init__(self, enhancer_type, enhancer_ckpt, device=None):
        """Load the enhancer backend.

        Args:
            enhancer_type: Backend name; must be ``'nsf-hifigan'``.
            enhancer_ckpt: Checkpoint path handed to the backend.
            device: Target device; defaults to CUDA when available, else CPU.

        Raises:
            ValueError: If ``enhancer_type`` is not a known backend.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

        if enhancer_type == 'nsf-hifigan':
            self.enhancer = NsfHifiGAN(enhancer_ckpt, device=self.device)
        else:
            raise ValueError(f" [x] Unknown enhancer: {enhancer_type}")

        # Cache of resampling kernels keyed by (source_rate, target_rate).
        # A tuple key is unambiguous, unlike concatenating the two numbers
        # as strings.
        self.resample_kernel = {}
        self.enhancer_sample_rate = self.enhancer.sample_rate()
        self.enhancer_hop_size = self.enhancer.hop_size()

    def _resample(self, audio, orig_rate, new_rate):
        """Resample ``audio`` from ``orig_rate`` to ``new_rate`` with a cached kernel."""
        if orig_rate == new_rate:
            # Nothing to do; avoid building a kernel for an identity transform.
            return audio
        key = (orig_rate, new_rate)
        if key not in self.resample_kernel:
            self.resample_kernel[key] = Resample(
                orig_rate, new_rate, lowpass_filter_width=128
            ).to(self.device)
        return self.resample_kernel[key](audio)

    def enhance(self,
                audio,  # 1, T
                sample_rate,
                f0,  # 1, n_frames, 1
                hop_size,
                adaptive_key=0,
                silence_front=0
                ):
        """Run the enhancer over ``audio`` guided by the ``f0`` contour.

        Args:
            audio: Waveform tensor of shape ``(1, T)``.
            sample_rate: Sample rate of ``audio``.
            f0: Pitch tensor of shape ``(1, n_frames, 1)``. It is not
                modified by this call.
            hop_size: Hop size (in samples of ``audio``) between f0 frames.
            adaptive_key: Shift in semitones; the audio is resampled and f0
                rescaled by ``2 ** (-adaptive_key / 12)`` before enhancing,
                then resampled back afterwards.
            silence_front: Seconds at the start of ``audio`` to skip; the
                skipped span is restored as zero padding in the result.

        Returns:
            Tuple ``(enhanced_audio, enhancer_sample_rate)`` where the sample
            rate is the one reported by the enhancer backend.
        """
        # Snap the skipped region to a whole number of f0 frames so audio and
        # f0 are trimmed by exactly the same duration.
        start_frame = int(silence_front * sample_rate / hop_size)
        real_silence_front = start_frame * hop_size / sample_rate
        audio = audio[:, int(np.round(real_silence_front * sample_rate)):]
        f0 = f0[:, start_frame:, :]

        # Adaptive parameters: the working sample rate is rounded to a
        # multiple of 100, so the factor actually applied is `real_factor`.
        adaptive_factor = 2 ** (-adaptive_key / 12)
        adaptive_sample_rate = 100 * int(np.round(self.enhancer_sample_rate / adaptive_factor / 100))
        real_factor = self.enhancer_sample_rate / adaptive_sample_rate

        # Resample the input audio to the adaptive rate.
        audio_res = self._resample(audio, sample_rate, adaptive_sample_rate)

        n_frames = int(audio_res.size(-1) // self.enhancer_hop_size + 1)

        # Resample f0 onto the enhancer's frame grid.
        f0_np = f0.squeeze(0).squeeze(-1).cpu().numpy()
        # Scale out of place: on CPU `.numpy()` shares memory with the
        # caller's tensor, so an in-place `*=` would corrupt the caller's f0.
        f0_np = f0_np * real_factor
        time_org = (hop_size / sample_rate) * np.arange(len(f0_np)) / real_factor
        time_frame = (self.enhancer_hop_size / self.enhancer_sample_rate) * np.arange(n_frames)
        f0_res = np.interp(time_frame, time_org, f0_np, left=f0_np[0], right=f0_np[-1])
        f0_res = torch.from_numpy(f0_res).unsqueeze(0).float().to(self.device)  # 1, n_frames

        # Enhance.
        enhanced_audio, enhancer_sample_rate = self.enhancer(audio_res, f0_res)

        # Bring the enhanced output back from the adaptive rate to the
        # enhancer's native rate (skipped when the two already match).
        enhanced_audio = self._resample(enhanced_audio, adaptive_sample_rate, enhancer_sample_rate)

        # Re-insert the skipped leading silence.
        if start_frame > 0:
            enhanced_audio = F.pad(enhanced_audio, (int(np.round(enhancer_sample_rate * real_silence_front)), 0))

        return enhanced_audio, enhancer_sample_rate


class NsfHifiGAN(torch.nn.Module):
    """Thin module wrapper around a model and config returned by ``load_model``."""

    def __init__(self, model_path, device=None):
        """Load the checkpoint at ``model_path`` onto ``device``.

        When ``device`` is None, CUDA is used if available, otherwise CPU.
        """
        super().__init__()
        self.device = device if device is not None else (
            'cuda' if torch.cuda.is_available() else 'cpu'
        )
        print('| Load HifiGAN: ', model_path)
        self.model, self.h = load_model(model_path, device=self.device)

    def sample_rate(self):
        """Return the sampling rate stored in the loaded config."""
        cfg = self.h
        return cfg.sampling_rate

    def hop_size(self):
        """Return the hop size stored in the loaded config."""
        cfg = self.h
        return cfg.hop_size

    def forward(self, audio, f0):
        """Compute a mel from ``audio`` and run the model on it with ``f0``.

        Returns:
            Tuple ``(enhanced_audio, sampling_rate)``; the audio is flattened
            to one dimension.
        """
        cfg = self.h
        mel_extractor = STFT(
            cfg.sampling_rate,
            cfg.num_mels,
            cfg.n_fft,
            cfg.win_size,
            cfg.hop_size,
            cfg.fmin,
            cfg.fmax,
        )
        with torch.no_grad():
            mel = mel_extractor.get_mel(audio)
            # f0 may be longer than the mel; trim it to the mel's last-dim length.
            n_mel_frames = mel.size(-1)
            waveform = self.model(mel, f0[:, :n_mel_frames]).view(-1)
        return waveform, self.h.sampling_rate
Empty file.
15 changes: 15 additions & 0 deletions vdecoder/nsf_hifigan/env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import os
import shutil


class AttrDict(dict):
    """Dictionary whose items are also accessible as attributes."""

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ``dict``."""
        super().__init__(*args, **kwargs)
        # Use the mapping itself as the attribute namespace, so `d.key`
        # and `d["key"]` read and write the same storage.
        self.__dict__ = self


def build_env(config, config_name, path):
    """Copy the ``config`` file into directory ``path`` as ``config_name``.

    Nothing happens when ``config`` already equals the destination path
    (plain string comparison). Otherwise ``path`` is created if missing and
    the file is copied.
    """
    destination = os.path.join(path, config_name)
    if config == destination:
        return
    os.makedirs(path, exist_ok=True)
    shutil.copyfile(config, destination)
Loading

0 comments on commit b624394

Please sign in to comment.