Update NSF-HIFIGAN Enhancer

SayaSS · Apr 8, 2023 · b624394 · b624394
1 parent c160066
commit b624394
Show file tree

Hide file tree

Showing 9 changed files with 819 additions and 28 deletions.
diff --git a/inference/infer_tool.py b/inference/infer_tool.py
@@ -114,7 +114,9 @@ class F0FilterException(Exception):
 class Svc(object):
     def __init__(self, net_g_path, config_path,
                  device=None,
-                 cluster_model_path="logs/44k/kmeans_10000.pt"):
+                 cluster_model_path="logs/44k/kmeans_10000.pt",
+                 nsf_hifigan_enhance = False
+                 ):
         self.net_g_path = net_g_path
         if device is None:
             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -125,11 +127,15 @@ def __init__(self, net_g_path, config_path,
         self.target_sample = self.hps_ms.data.sampling_rate
         self.hop_size = self.hps_ms.data.hop_length
         self.spk2id = self.hps_ms.spk
+        self.nsf_hifigan_enhance = nsf_hifigan_enhance
         # 加载hubert
         self.hubert_model = utils.get_hubert_model().to(self.dev)
         self.load_model()
         if os.path.exists(cluster_model_path):
             self.cluster_model = cluster.get_cluster_model(cluster_model_path)
+        if self.nsf_hifigan_enhance:
+            from modules.enhancer import Enhancer
+            self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
 
     def load_model(self):
         # 获取模型配置
@@ -185,7 +191,8 @@ def infer(self, speaker, tran, raw_path,
               auto_predict_f0=False,
               noice_scale=0.4,
               f0_filter=False,
-              F0_mean_pooling=False
+              F0_mean_pooling=False,
+              enhancer_adaptive_key = 0
               ):
 
         speaker_id = self.spk2id.__dict__.get(speaker)
@@ -199,6 +206,13 @@ def infer(self, speaker, tran, raw_path,
         with torch.no_grad():
             start = time.time()
             audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0,0].data.float()
+            if self.nsf_hifigan_enhance:
+                audio, _ = self.enhancer.enhance(
+                                                                        audio[None,:], 
+                                                                        self.target_sample, 
+                                                                        f0[:,:,None], 
+                                                                        self.hps_ms.data.hop_length, 
+                                                                        adaptive_key = enhancer_adaptive_key)
             use_time = time.time() - start
             print("vits use time:{}".format(use_time))
         return audio, audio.shape[-1]
@@ -219,7 +233,8 @@ def slice_inference(self,
                         clip_seconds=0,
                         lg_num=0,
                         lgr_num =0.75,
-                        F0_mean_pooling = False
+                        F0_mean_pooling = False,
+                        enhancer_adaptive_key = 0
                         ):
         wav_path = raw_audio_path
         chunks = slicer.cut(wav_path, db_thresh=slice_db)
@@ -258,7 +273,8 @@ def slice_inference(self,
                                                     cluster_infer_ratio=cluster_infer_ratio,
                                                     auto_predict_f0=auto_predict_f0,
                                                     noice_scale=noice_scale,
-                                                    F0_mean_pooling = F0_mean_pooling
+                                                    F0_mean_pooling = F0_mean_pooling,
+                                                    enhancer_adaptive_key = enhancer_adaptive_key
                                                     )
                 _audio = out_audio.cpu().numpy()
                 pad_len = int(self.target_sample * pad_seconds)

diff --git a/inference_main.py b/inference_main.py
@@ -36,19 +36,19 @@ def main():
     parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='聚类方案占比，范围0-1，若没有训练聚类模型则默认0即可')
     parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='两段音频切片的交叉淡入长度，如果强制切片后出现人声不连贯可调整该数值，如果连贯建议采用默认值0，单位为秒')
     parser.add_argument('-fmp', '--f0_mean_pooling', type=bool, default=False, help='是否对F0使用均值滤波器(池化)，对部分哑音有改善。注意，启动该选项会导致推理速度下降，默认关闭')
-
+    parser.add_argument('-eh', '--enhance', type=bool, default=False, help='是否使用NSF_HIFIGAN增强器,该选项对部分训练集少的模型有一定的音质增强效果，但是对训练好的模型有反面效果，默认关闭')
+
     # 不用动的部分
     parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='默认-40，嘈杂的音频可以-30，干声保留呼吸可以-50')
     parser.add_argument('-d', '--device', type=str, default=None, help='推理设备，None则为自动选择cpu和gpu')
     parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='噪音级别，会影响咬字和音质，较为玄学')
     parser.add_argument('-p', '--pad_seconds', type=float, default=0.5, help='推理音频pad秒数，由于未知原因开头结尾会有异响，pad一小段静音段后就不会出现')
     parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='音频输出格式')
     parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='自动音频切片后，需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例，范围0-1,左开右闭')
-
+    parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0, help='使增强器适应更高的音域(单位为半音数)|默认为0')
+
     args = parser.parse_args()
 
-    svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path)
-    infer_tool.mkdir(["raw", "results"])
     clean_names = args.clean_names
     trans = args.trans
     spk_list = args.spk_list
@@ -62,6 +62,11 @@ def main():
     lg = args.linear_gradient
     lgr = args.linear_gradient_retain
     F0_mean_pooling = args.f0_mean_pooling
+    enhance = args.enhance
+    enhancer_adaptive_key = args.enhancer_adaptive_key
+
+    svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance)
+    infer_tool.mkdir(["raw", "results"])
 
     infer_tool.fill_a_to_b(trans, clean_names)
     for clean_name, tran in zip(clean_names, trans):
@@ -107,7 +112,8 @@ def main():
                                                         cluster_infer_ratio=cluster_infer_ratio,
                                                         auto_predict_f0=auto_predict_f0,
                                                         noice_scale=noice_scale,
-                                                        F0_mean_pooling = F0_mean_pooling
+                                                        F0_mean_pooling = F0_mean_pooling,
+                                                        enhancer_adaptive_key = enhancer_adaptive_key
                                                         )
                     _audio = out_audio.cpu().numpy()
                     pad_len = int(svc_model.target_sample * pad_seconds)
@@ -125,6 +131,7 @@ def main():
             cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
             res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}.{wav_format}'
             soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
-
+            svc_model.clear_empty()
+
 if __name__ == '__main__':
     main()
diff --git a/modules/enhancer.py b/modules/enhancer.py
@@ -0,0 +1,105 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from vdecoder.nsf_hifigan.nvSTFT import STFT
+from vdecoder.nsf_hifigan.models import load_model
+from torchaudio.transforms import Resample
+
+class Enhancer:
+    """Post-process synthesized audio with a pretrained enhancer model.
+
+    The audio is resampled to the rate the enhancer expects (optionally
+    shifted by ``adaptive_key`` semitones), passed through the enhancer
+    together with a matching f0 curve, and resampled back.
+    Only the ``'nsf-hifigan'`` enhancer type is supported.
+    """
+
+    def __init__(self, enhancer_type, enhancer_ckpt, device=None):
+        """Load the enhancer backend.
+
+        Args:
+            enhancer_type: Backend name; only ``'nsf-hifigan'`` is accepted.
+            enhancer_ckpt: Checkpoint path handed to the backend.
+            device: Torch device; defaults to CUDA when available, else CPU.
+
+        Raises:
+            ValueError: If ``enhancer_type`` is not recognized.
+        """
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = device
+
+        if enhancer_type == 'nsf-hifigan':
+            self.enhancer = NsfHifiGAN(enhancer_ckpt, device=self.device)
+        else:
+            raise ValueError(f" [x] Unknown enhancer: {enhancer_type}")
+
+        # Cache of Resample modules keyed by (source_rate, target_rate).
+        self.resample_kernel = {}
+        self.enhancer_sample_rate = self.enhancer.sample_rate()
+        self.enhancer_hop_size = self.enhancer.hop_size()
+
+    def _resample(self, audio, orig_rate, new_rate):
+        """Resample ``audio`` from ``orig_rate`` to ``new_rate``, caching kernels."""
+        if orig_rate == new_rate:
+            return audio
+        # A tuple key is unambiguous; concatenating the two integers as
+        # strings could make different rate pairs collide.
+        key = (orig_rate, new_rate)
+        if key not in self.resample_kernel:
+            self.resample_kernel[key] = Resample(
+                orig_rate, new_rate, lowpass_filter_width=128
+            ).to(self.device)
+        return self.resample_kernel[key](audio)
+
+    def enhance(self,
+                audio,  # 1, T
+                sample_rate,
+                f0,  # 1, n_frames, 1
+                hop_size,
+                adaptive_key=0,
+                silence_front=0
+                ):
+        """Run the enhancer over ``audio``.
+
+        Args:
+            audio: Waveform tensor, indexed as ``[1, T]``.
+            sample_rate: Sample rate of ``audio``.
+            f0: Fundamental-frequency tensor, indexed as ``[1, n_frames, 1]``.
+                It is not modified.
+            hop_size: Hop size (in samples of ``audio``) between f0 frames.
+            adaptive_key: Semitone shift applied to the enhancer's working
+                pitch range; 0 disables the shift.
+            silence_front: Seconds at the start of ``audio`` to skip; the
+                skipped span is zero-padded back onto the result.
+
+        Returns:
+            Tuple ``(enhanced_audio, enhancer_sample_rate)``.
+        """
+        # Snap the skipped leading span to a whole number of f0 frames.
+        start_frame = int(silence_front * sample_rate / hop_size)
+        real_silence_front = start_frame * hop_size / sample_rate
+        audio = audio[:, int(np.round(real_silence_front * sample_rate)):]
+        f0 = f0[:, start_frame:, :]
+
+        # Adaptive parameters: the working rate is rounded to a multiple of 100.
+        adaptive_factor = 2 ** (-adaptive_key / 12)
+        adaptive_sample_rate = 100 * int(np.round(self.enhancer_sample_rate / adaptive_factor / 100))
+        real_factor = self.enhancer_sample_rate / adaptive_sample_rate
+
+        # Resample the input to the adaptive working rate.
+        audio_res = self._resample(audio, sample_rate, adaptive_sample_rate)
+
+        n_frames = int(audio_res.size(-1) // self.enhancer_hop_size + 1)
+
+        # Resample f0 onto the enhancer's frame grid. The multiplication
+        # creates a new array: on CPU, ``.numpy()`` shares memory with the
+        # tensor, so an in-place ``*=`` would silently alter the caller's f0.
+        f0_np = f0.squeeze(0).squeeze(-1).cpu().numpy() * real_factor
+        time_org = (hop_size / sample_rate) * np.arange(len(f0_np)) / real_factor
+        time_frame = (self.enhancer_hop_size / self.enhancer_sample_rate) * np.arange(n_frames)
+        f0_res = np.interp(time_frame, time_org, f0_np, left=f0_np[0], right=f0_np[-1])
+        f0_res = torch.from_numpy(f0_res).unsqueeze(0).float().to(self.device)  # 1, n_frames
+
+        # Enhance.
+        enhanced_audio, enhancer_sample_rate = self.enhancer(audio_res, f0_res)
+
+        # Map the output back from the adaptive rate. The previous guard
+        # tested ``adaptive_factor != 0``, which is always true for a power
+        # of two; the helper skips the work when the rates already match.
+        enhanced_audio = self._resample(enhanced_audio, adaptive_sample_rate, enhancer_sample_rate)
+
+        # Restore the skipped leading span as silence.
+        if start_frame > 0:
+            enhanced_audio = F.pad(enhanced_audio, (int(np.round(enhancer_sample_rate * real_silence_front)), 0))
+
+        return enhanced_audio, enhancer_sample_rate
+
+
+class NsfHifiGAN(torch.nn.Module):
+    """Wrapper that runs a pretrained NSF-HiFiGAN model as an audio enhancer."""
+
+    def __init__(self, model_path, device=None):
+        """Load the model and its config from ``model_path``.
+
+        Args:
+            model_path: Checkpoint path passed to ``load_model``.
+            device: Torch device; defaults to CUDA when available, else CPU.
+        """
+        super().__init__()
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = device
+        print('| Load HifiGAN: ', model_path)
+        self.model, self.h = load_model(model_path, device=self.device)
+        # Build the mel extractor once instead of on every forward call;
+        # its arguments come only from the fixed model config.
+        # NOTE(review): assumes STFT is reusable across calls - confirm.
+        self.stft = STFT(
+                self.h.sampling_rate,
+                self.h.num_mels,
+                self.h.n_fft,
+                self.h.win_size,
+                self.h.hop_size,
+                self.h.fmin,
+                self.h.fmax)
+
+    def sample_rate(self):
+        """Return the sampling rate from the model config."""
+        return self.h.sampling_rate
+
+    def hop_size(self):
+        """Return the hop size from the model config."""
+        return self.h.hop_size
+
+    def forward(self, audio, f0):
+        """Re-synthesize ``audio`` from its mel spectrogram and ``f0``.
+
+        Returns:
+            Tuple ``(enhanced_audio, sampling_rate)``; the audio is
+            flattened to one dimension.
+        """
+        with torch.no_grad():
+            mel = self.stft.get_mel(audio)
+            # Trim f0 to the number of mel frames before synthesis.
+            enhanced_audio = self.model(mel, f0[:,:mel.size(-1)]).view(-1)
+            return enhanced_audio, self.h.sampling_rate
diff --git a/pretrain/nsf_hifigan/put_nsf_hifigan_ckpt_here b/pretrain/nsf_hifigan/put_nsf_hifigan_ckpt_here
diff --git a/vdecoder/nsf_hifigan/env.py b/vdecoder/nsf_hifigan/env.py
@@ -0,0 +1,15 @@
+import os
+import shutil
+
+
+class AttrDict(dict):
+    """Dictionary whose items are also reachable as attributes."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Point the instance namespace at the mapping itself so attribute
+        # access and item access share one storage.
+        self.__dict__ = self
+
+
+def build_env(config, config_name, path):
+    """Copy the file ``config`` into directory ``path`` as ``config_name``.
+
+    The directory is created if needed. Nothing happens when ``config``
+    already equals the destination path.
+    """
+    destination = os.path.join(path, config_name)
+    if config == destination:
+        return
+    os.makedirs(path, exist_ok=True)
+    shutil.copyfile(config, destination)