Skip to content

Commit

Permalink
Add jieba for Chinese TTS models (#797)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Apr 21, 2024
1 parent 2e0ee0e commit 6b353bf
Show file tree
Hide file tree
Showing 14 changed files with 513 additions and 8 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ if(SHERPA_ONNX_ENABLE_TTS)
set(ESPEAK_NG_DIR ${espeak_ng_SOURCE_DIR})
message(STATUS "ESPEAK_NG_DIR: ${ESPEAK_NG_DIR}")
include(piper-phonemize)
include(cppjieba) # For Chinese TTS. It is a header-only C++ library
endif()

add_subdirectory(sherpa-onnx)
Expand Down
45 changes: 45 additions & 0 deletions cmake/cppjieba.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Fetches the pinned cppjieba source archive with FetchContent and adds the
# extracted source tree to the build with add_subdirectory().
#
# A pre-downloaded copy of the archive is used instead of the network URLs
# when one exists in any of the directories listed in possible_file_locations.
function(download_cppjieba)
include(FetchContent)

# Two URLs for the same tagged archive, plus the SHA256 checksum that the
# fetched archive is verified against.
set(cppjieba_URL "https://github.com/csukuangfj/cppjieba/archive/refs/tags/sherpa-onnx-2024-04-19.tar.gz")
set(cppjieba_URL2 "https://hub.nuaa.cf/csukuangfj/cppjieba/archive/refs/tags/sherpa-onnx-2024-04-19.tar.gz")
set(cppjieba_HASH "SHA256=03e5264687f0efaef05487a07d49c3f4c0f743347bfbf825df4b30cc75ac5288")

# If you don't have access to the Internet, pre-download the cppjieba archive
# and place it, under the file name shown below, in one of these locations.
# The locations are probed in the order listed; the first hit wins.
set(possible_file_locations
$ENV{HOME}/Downloads/cppjieba-sherpa-onnx-2024-04-19.tar.gz
${CMAKE_SOURCE_DIR}/cppjieba-sherpa-onnx-2024-04-19.tar.gz
${CMAKE_BINARY_DIR}/cppjieba-sherpa-onnx-2024-04-19.tar.gz
/tmp/cppjieba-sherpa-onnx-2024-04-19.tar.gz
/star-fj/fangjun/download/github/cppjieba-sherpa-onnx-2024-04-19.tar.gz
)

foreach(f IN LISTS possible_file_locations)
if(EXISTS ${f})
# Point the primary URL at the local file (normalized to a CMake-style
# path) and clear the second URL so that only the local copy is used.
set(cppjieba_URL "${f}")
file(TO_CMAKE_PATH "${cppjieba_URL}" cppjieba_URL)
message(STATUS "Found local downloaded cppjieba: ${cppjieba_URL}")
set(cppjieba_URL2)
break()
endif()
endforeach()

# cppjieba_URL2 expands to nothing when a local archive was found above.
FetchContent_Declare(cppjieba
URL
${cppjieba_URL}
${cppjieba_URL2}
URL_HASH
${cppjieba_HASH}
)

# Populate only once, even if this function is reached again in the same
# configure run.
FetchContent_GetProperties(cppjieba)
if(NOT cppjieba_POPULATED)
message(STATUS "Downloading cppjieba ${cppjieba_URL}")
FetchContent_Populate(cppjieba)
endif()
message(STATUS "cppjieba is downloaded to ${cppjieba_SOURCE_DIR}")
# EXCLUDE_FROM_ALL keeps cppjieba's own targets out of the default build
# unless something in this project depends on them.
add_subdirectory(${cppjieba_SOURCE_DIR} ${cppjieba_BINARY_DIR} EXCLUDE_FROM_ALL)
endfunction()

# Run the download as soon as this file is included.
download_cppjieba()
3 changes: 3 additions & 0 deletions sherpa-onnx/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ list(APPEND sources

if(SHERPA_ONNX_ENABLE_TTS)
list(APPEND sources
jieba-lexicon.cc
lexicon.cc
offline-tts-character-frontend.cc
offline-tts-impl.cc
Expand Down Expand Up @@ -184,6 +185,7 @@ endif()
# Libraries that sherpa-onnx-core links against only when TTS support is
# enabled.
if(SHERPA_ONNX_ENABLE_TTS)
target_link_libraries(sherpa-onnx-core piper_phonemize)
target_link_libraries(sherpa-onnx-core fstfar fst)
target_link_libraries(sherpa-onnx-core cppjieba)
endif()

if(SHERPA_ONNX_ENABLE_CHECK)
Expand Down Expand Up @@ -491,6 +493,7 @@ if(SHERPA_ONNX_ENABLE_TESTS)
)
# Test sources that are added only when TTS support is enabled.
if(SHERPA_ONNX_ENABLE_TTS)
list(APPEND sherpa_onnx_test_srcs
cppjieba-test.cc
piper-phonemize-test.cc
)
endif()
Expand Down
144 changes: 144 additions & 0 deletions sherpa-onnx/csrc/cppjieba-test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// sherpa-onnx/csrc/cppjieba-test.cc
//
// Copyright (c) 2024 Xiaomi Corporation
#include <iostream>
#include <regex> // NOLINT
#include <string>
#include <vector>

#include "cppjieba/Jieba.hpp"
#include "gtest/gtest.h"
#include "sherpa-onnx/csrc/file-utils.h"
#include "sherpa-onnx/csrc/macros.h"

namespace sherpa_onnx {

// Locations of the cppjieba dictionary files, given relative to the current
// working directory. Please download the dict files from
// https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
constexpr const char *kDictPath = "./dict/jieba.dict.utf8";
constexpr const char *kHmmPath = "./dict/hmm_model.utf8";
constexpr const char *kUserDictPath = "./dict/user.dict.utf8";
constexpr const char *kIdfPath = "./dict/idf.utf8";
constexpr const char *kStopWordPath = "./dict/stop_words.utf8";

// Demonstrates the basic cppjieba calls (Cut, CutForSearch, InsertUserWord and
// CutForSearch with word offsets) and prints each result to stdout. Nothing is
// asserted. The test returns early when the dictionary file is not present.
TEST(CppJieBa, Case1) {
  if (!FileExists(kDictPath)) {
    SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath);
    return;
  }

  cppjieba::Jieba segmenter(kDictPath, kHmmPath, kUserDictPath, kIdfPath,
                            kStopWordPath);

  std::vector<std::string> tokens;
  std::vector<cppjieba::Word> tokens_with_offset;

  // Prints the current content of `tokens` on one line, separated by '/'.
  const auto print_tokens = [&tokens]() {
    std::cout << limonp::Join(tokens.begin(), tokens.end(), "/") << std::endl;
  };

  std::string text = "他来到了网易杭研大厦";
  std::cout << text << std::endl;
  std::cout << "[demo] Cut With HMM" << std::endl;
  segmenter.Cut(text, tokens, true);
  print_tokens();
  /*
  他来到了网易杭研大厦
  [demo] Cut With HMM
  他/来到/了/网易/杭研/大厦
  */

  text = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
  std::cout << text << std::endl;
  std::cout << "[demo] CutForSearch" << std::endl;
  segmenter.CutForSearch(text, tokens);
  print_tokens();
  /*
  小明硕士毕业于中国科学院计算所,后在日本京都大学深造
  [demo] CutForSearch
  小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造
  */

  // The same phrase is cut before and after it is registered as a user word.
  std::cout << "[demo] Insert User Word" << std::endl;
  segmenter.Cut("男默女泪", tokens);
  print_tokens();
  segmenter.InsertUserWord("男默女泪");
  segmenter.Cut("男默女泪", tokens);
  print_tokens();
  /*
  [demo] Insert User Word
  男默/女泪
  男默女泪
  */

  std::cout << "[demo] CutForSearch Word With Offset" << std::endl;
  segmenter.CutForSearch(text, tokens_with_offset, true);
  std::cout << tokens_with_offset << std::endl;
  /*
  [demo] CutForSearch Word With Offset
  [{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业",
  "offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21},
  {"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word":
  "科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算",
  "offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45},
  {"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本",
  "offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66},
  {"word": "日本京都大学", "offset": 54}, {"word": " 深造", "offset": 72}]
  */
  // see more test at
  // https://github.com/yanyiwu/cppjieba/blob/master/test/demo.cpp
}

// Segments two Chinese sentences with cppjieba (HMM enabled) and prints the
// resulting words joined by '_'. The second sentence has some punctuation
// removed before it is segmented. Nothing is asserted. The test returns early
// when the dictionary file is not present.
TEST(CppJieBa, Case2) {
  if (!FileExists(kDictPath)) {
    SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath);
    return;
  }

  cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath,
                        kStopWordPath);
  std::string s =
      "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如"
      "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感"
      "受着生命的奇迹与温柔";
  std::vector<std::string> words;
  bool is_hmm = true;
  jieba.Cut(s, words, is_hmm);
  // limonp::Join is the same helper Case1 uses; it replaces a hand-written
  // std::ostringstream loop that produced the identical '_'-separated line.
  std::cout << limonp::Join(words.begin(), words.end(), "_") << "\n";
  /*
  当_夜幕降临_,_星光点点_,_伴随_着_微风_拂面_,
  _我_在_静谧_中_感受_着_时光_的_流转_,
  _思念_如_涟漪_荡漾_,_梦境_如_画卷_展开_,_我_与_自然_融为一体_,
  _沉静_在_这_片_宁静_的_美丽_之中_,_感受_着_生命_的_奇迹_与_温柔
  */
  s = "这里有:红的、绿的、蓝的;各种各样的颜色都有!你想要什么呢?测试.";

  // Full-width punctuation is written as an alternation; presumably because
  // each mark is several bytes in UTF-8 and std::regex on std::string matches
  // byte by byte.
  std::regex punct_re(":|、|;");
  std::string s2 = std::regex_replace(s, punct_re, "");

  // ASCII '.', '?' and '!' are removed in a single pass.
  // NOTE(review): the sample sentence contains the full-width '!' and '?',
  // which these ASCII patterns do not match -- confirm whether the full-width
  // forms were meant to be stripped as well.
  std::regex ascii_punct_re("[.?!]");
  s2 = std::regex_replace(s2, ascii_punct_re, "");
  std::cout << s << "\n" << s2 << "\n";

  words.clear();
  jieba.Cut(s2, words, is_hmm);
  std::cout << limonp::Join(words.begin(), words.end(), "_") << "\n";
}

} // namespace sherpa_onnx
1 change: 1 addition & 0 deletions sherpa-onnx/csrc/file-utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ bool FileExists(const std::string &filename) {
// Terminates the process if `filename` does not exist: a FATAL message naming
// the file is logged and then exit(-1) is called. Returns normally otherwise.
void AssertFileExists(const std::string &filename) {
  if (FileExists(filename)) {
    return;
  }

  SHERPA_ONNX_LOG(FATAL) << filename << " does not exist!";
  // Presumably a backstop in case the FATAL log statement itself returns.
  exit(-1);
}

Expand Down
Loading

0 comments on commit 6b353bf

Please sign in to comment.