Skip to content

Commit

Permalink
Update: GPT-SoVITS v2 (#186)
Browse files Browse the repository at this point in the history
* Update: GPT-SoVITS v2 support

Share the g2pw code with Bert-Vits2 but use a different BERT model.

* Update: polyphonic for g2pw
  • Loading branch information
Artrajz authored Jan 2, 2025
1 parent b16654f commit 7c64999
Show file tree
Hide file tree
Showing 54 changed files with 121,328 additions and 114,896 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ phrases_dict.txt
/data/bert/vits_chinese_bert/prosody_model.pt
/data/emotional/dimensional_emotion_npy/
/data/models/
/vits/text/chinese_dialect_lexicons
/vits/text/chinese_dialect_lexicons
/data/polyphonic.yaml
2 changes: 2 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from utils.data_utils import clean_folder, check_is_none
from utils.phrases_dict import phrases_dict_init
from module import polyphonic
from tts_app.frontend.views import frontend
from tts_app.voice_api.views import voice_api
from tts_app.auth.views import auth
Expand All @@ -25,6 +26,7 @@
# app.config.update(config)

phrases_dict_init()
polyphonic.load_polyphonic()

csrf = CSRFProtect(app)
# Disable CSRF protection for TTS API requests
Expand Down
6 changes: 3 additions & 3 deletions bert_vits2/bert_vits2.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(self, vits_path, config, device=torch.device("cpu"), **kwargs):
self.ja_bert_extra = False
self.ja_bert_dim = 1024
self.num_tones = num_tones
self.pinyinPlus = None
self.pinyin_g2pw = None

# Compatible with legacy versions
self.version = process_legacy_versions(self.hps_ms).lower().replace("-", "_")
Expand Down Expand Up @@ -208,7 +208,7 @@ def load_model(self, model_handler):
Synthesizer = SynthesizerTrn

if self.version == "2.4":
self.pinyinPlus = self.model_handler.get_pinyinPlus()
self.pinyin_g2pw = self.model_handler.get_pinyin_g2pw()
self.net_g = Synthesizer(
len(self.symbols),
self.hps_ms.data.filter_length // 2 + 1,
Expand All @@ -231,7 +231,7 @@ def get_text(self, text, language_str: str, hps, style_text=None, style_weight=0

tokenizer, _ = self.model_handler.get_bert_model(self.bert_model_names[language_str])

norm_text, phone, tone, word2ph = clean_text(text, clean_text_lang_str, tokenizer, self.pinyinPlus)
norm_text, phone, tone, word2ph = clean_text(text, clean_text_lang_str, tokenizer, self.pinyin_g2pw)

phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str, self._symbol_to_id)

Expand Down
5 changes: 0 additions & 5 deletions bert_vits2/g2pW/pypinyin_G2pW_bv2/__init__.py

This file was deleted.

8 changes: 6 additions & 2 deletions bert_vits2/text/chinese_v240.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

import cn2an

from module import polyphonic

normalizer = lambda x: cn2an.transform(x, "an2cn")

current_file_path = os.path.dirname(__file__)
Expand Down Expand Up @@ -114,12 +116,14 @@ def _g2p(segments, pinyinPlus, **kwargs):
currentIndex = 0
for word, pos in seg_cut:
curr_orig_initials = orig_initials[currentIndex: currentIndex + len(word)]
curr_orig_finalss = orig_finals[currentIndex: currentIndex + len(word)]
curr_orig_finals = orig_finals[currentIndex: currentIndex + len(word)]
curr_orig_initials, curr_orig_finals = polyphonic.correct_pronunciation(word, (curr_orig_initials,
curr_orig_finals), style=2)
currentIndex = currentIndex + len(word)
if pos == "eng":
continue
sub_initials, sub_finals = _get_initials_finalsV2(
word, curr_orig_initials, curr_orig_finalss
word, curr_orig_initials, curr_orig_finals
)
sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
initials.append(sub_initials)
Expand Down
11 changes: 8 additions & 3 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@
MAX_CONTENT_LENGTH = 5242880

# Absolute path of vits-simple-api (current program root path)
BASE_DIR = os.path.dirname(os.path.realpath(__file__))
BASE_DIR: str = os.path.dirname(os.path.realpath(__file__))
# Configuration file path
CONFIG_PATH = os.path.join(BASE_DIR, "config.yaml")
CONFIG_PATH: str = os.path.join(BASE_DIR, "config.yaml")

# WTForms CSRF
SECRET_KEY = secrets.token_hex(16)
SECRET_KEY: str = secrets.token_hex(16)


def update_nested_dict(original, updates):
Expand Down Expand Up @@ -260,6 +260,10 @@ class LanguageIdentification(BaseModel):
r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'


# Configuration section for the polyphonic-word pronunciation dictionary.
# NOTE(review): this commit also adds data/polyphonic.yaml, so dict_path is
# presumably resolved relative to the data directory — confirm against the loader.
class Polyphonic(BaseModel):
# File name (or path) of the YAML dictionary; defaults to "polyphonic.yaml".
dict_path: str = "polyphonic.yaml"


class User(BaseModel):
id: int = 0
username: str = Field(
Expand Down Expand Up @@ -293,6 +297,7 @@ class Config(BaseModel):
system: System = System()
log_config: LogConfig = LogConfig()
language_identification: LanguageIdentification = LanguageIdentification()
polyphonic: Polyphonic = Polyphonic()
reading_config: ReadingConfig = ReadingConfig()
vits_config: VitsConfig = VitsConfig()
w2v2_vits_config: W2V2VitsConfig = W2V2VitsConfig()
Expand Down
53 changes: 53 additions & 0 deletions data/polyphonic.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
'polyphonic':
'地壳': ['di4', 'qiao4']
'柏树': ['bai3', 'shu4']
'曝光': ['bao4', 'guang1']
'弹力': ['tan2', 'li4']
'字帖': ['zi4', 'tie4']
'包扎': ['bao1', 'za1']
'哪吒': ['ne2', 'zha1']
'说服': ['shuo1', 'fu2']
'骨头': ['gu3', 'tou5']
'口供': ['kou3', 'gong4']
'抹布': ['ma1', 'bu4']
'眼眶': ['yan3', 'kuang4']
'品行': ['pin3', 'xing2']
'颤抖': ['chan4', 'dou3']
'鸭绿江': ['ya1', 'lu4', 'jiang1']
'撒切尔': ['sa4', 'qie4', 'er3']
'身无长物': ['shen1', 'wu2', 'chang2', 'wu4']
'关卡': ['guan1', 'qia3']
'怀揣': ['huai2', 'chuai1']
'供称': ['gong4', 'cheng1']
'作坊': ['zuo1', 'fang5']
'嚷嚷': ['rang1', 'rang5']
'商厦': ['shang1', 'sha4']
'大厦': ['da4', 'sha4']
'刹车': ['sha1', 'che1']
'嘚瑟': ['de4', 'se5']
'朝鲜': ['chao2', 'xian3']
'阿房宫': ['e1', 'pang2', 'gong1']
'阿胶': ['e1', 'jiao1']
'咖喱': ['ga1', 'li5']
'时分': ['shi2', 'fen1']
'蚌埠': ['beng4', 'bu4']
'驯服': ['xun4', 'fu2']
'幸免于难': ['xing4', 'mian3', 'yu2', 'nan4']
'恶行': ['e4', 'xing2']
'扎实': ['zha1', 'shi2']
'干将': ['gan4', 'jiang4']
'抗住': ['kang2', 'zhu4']
'着想': ['zhuo2', 'xiang3']
'泄露': ['xie4', 'lou4']
'亵玩': ['xie4', 'wan2']
'诸葛': ['zhu1', 'ge3']
'真无马邪': ['zhen1', 'wu2', 'ma3', 'ye2']
'胶鬲': ['jiao1', 'ge2']
'熟悉': ['shu2', 'xi1']
'不太熟悉': ['bu2', 'tai4', 'shu2', 'xi1']
'爪牙': ['zhao3', 'ya2']
'无长': ['wu2', 'zhang3']
'无少': ['wu2', 'shao4']
'倒是': ['dao4', 'shi4']
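'': ['duo2']  # NOTE(review): empty key — the original character was probably lost in extraction; restore it from the upstream file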
'背着手': ['bei4', 'zhe3', 'shou3']
10 changes: 6 additions & 4 deletions gpt_sovits/AR/models/t2s_lightning_module.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_lightning_module.py
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
# reference: https://github.com/lifeiteng/vall-e
import os, sys

now_dir = os.getcwd()
Expand All @@ -13,11 +14,11 @@


class Text2SemanticLightningModule(LightningModule):
def __init__(self, config, output_dir, is_train=True, flash_attn_enabled: bool = False):
def __init__(self, config, output_dir, is_train=True):
super().__init__()
self.config = config
self.top_k = 3
self.model = Text2SemanticDecoder(config=config, top_k=self.top_k, flash_attn_enabled=flash_attn_enabled)
self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
pretrained_s1 = config.get("pretrained_s1")
if pretrained_s1 and is_train:
# print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
Expand All @@ -35,7 +36,8 @@ def __init__(self, config, output_dir, is_train=True, flash_attn_enabled: bool =
def training_step(self, batch: Dict, batch_idx: int):
opt = self.optimizers()
scheduler = self.lr_schedulers()
loss, acc = self.model.forward(
forward = self.model.forward if self.config["train"].get("if_dpo", False) == True else self.model.forward_old
loss, acc = forward(
batch["phoneme_ids"],
batch["phoneme_ids_len"],
batch["semantic_ids"],
Expand Down
3 changes: 2 additions & 1 deletion gpt_sovits/AR/models/t2s_lightning_module_onnx.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/model/t2s_lightning_module.py
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
# reference: https://github.com/lifeiteng/vall-e
import os, sys

now_dir = os.getcwd()
Expand Down
Loading

0 comments on commit 7c64999

Please sign in to comment.