Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[tts] fix point bug #2255

Merged
merged 1 commit into from
Aug 19, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 36 additions & 7 deletions paddlespeech/t2s/frontend/mix_frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,31 @@ def is_other(self, char):

def _split(self, text: str) -> List[str]:
    """Split mixed-language text into a list of sentences.

    Steps:
      1. Strip a fixed set of bracket/symbol characters from ``text``.
      2. Turn each English sentence-ending period "." into the full-width
         "。" so the sentence splitter can break on it. A period is
         converted only when it is neither the first nor the last character
         and both neighbours are an alphabet character (per
         ``self.is_alphabet``) or a space. Periods between digits, such as
         the decimal point in "3.5", are left untouched.
      3. Insert a newline after every match of ``self.SENTENCE_SPLITOR``
         (keeping its first capture group) and split on newlines.

    Args:
        text: Raw input text.

    Returns:
        The stripped sentences, in their original order.
    """
    text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text)

    # Work on a list of characters so that exactly one position is rewritten
    # at a time. ``str.replace`` would rewrite *every* "." in the text
    # (including decimal points) as soon as a single period qualified.
    chars = list(text)
    last_index = len(chars) - 1
    for index, ch in enumerate(chars):
        # A period at the very beginning or very end is left as-is.
        if ch != "." or index == 0 or index == last_index:
            continue
        prev_ch = chars[index - 1]
        next_ch = chars[index + 1]
        if ((self.is_alphabet(prev_ch) or prev_ch == " ") and
                (self.is_alphabet(next_ch) or next_ch == " ")):
            # "." and "。" are both a single character, so the indices of
            # the remaining characters stay valid.
            chars[index] = "。"
    text = "".join(chars)

    text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
    text = text.strip()
    sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]

    return sentences

def _distinguish(self, text: str) -> List[str]:
Expand All @@ -77,9 +99,11 @@ def _distinguish(self, text: str) -> List[str]:
temp_seg = ""
temp_lang = ""

# Determine the type of each character. type: blank, chinese, alphabet, number, unk.
# Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
for ch in text:
if self.is_chinese(ch):
if ch == ".":
types.append("point")
elif self.is_chinese(ch):
types.append("zh")
elif self.is_alphabet(ch):
types.append("en")
Expand All @@ -96,21 +120,26 @@ def _distinguish(self, text: str) -> List[str]:

# find the first char of the seg
if flag == 0:
if types[i] != "unk" and types[i] != "blank":
# 首个字符是中文,英文或者数字
if types[i] == "zh" or types[i] == "en" or types[i] == "num":
temp_seg += text[i]
temp_lang = types[i]
flag = 1

else:
if types[i] == temp_lang or types[i] == "num":
# 数字和小数点均与前面的字符合并,类型属于前面一个字符的类型
if types[i] == temp_lang or types[i] == "num" or types[
i] == "point":
temp_seg += text[i]

elif temp_lang == "num" and types[i] != "unk":
# 数字与后面的任意字符都拼接
elif temp_lang == "num":
temp_seg += text[i]
if types[i] == "zh" or types[i] == "en":
temp_lang = types[i]

elif temp_lang == "en" and types[i] == "blank":
# 如果是空格则与前面字符拼接
elif types[i] == "blank":
temp_seg += text[i]

elif types[i] == "unk":
Expand All @@ -119,7 +148,7 @@ def _distinguish(self, text: str) -> List[str]:
else:
segments.append((temp_seg, temp_lang))

if types[i] != "unk" and types[i] != "blank":
if types[i] == "zh" or types[i] == "en":
temp_seg = text[i]
temp_lang = types[i]
flag = 1
Expand Down