From 6b353bfb4256443ac9695d0d355e122606b002b4 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 21 Apr 2024 14:47:13 +0800 Subject: [PATCH] Add jieba for Chinese TTS models (#797) --- CMakeLists.txt | 1 + cmake/cppjieba.cmake | 45 ++++ sherpa-onnx/csrc/CMakeLists.txt | 3 + sherpa-onnx/csrc/cppjieba-test.cc | 144 ++++++++++++ sherpa-onnx/csrc/file-utils.cc | 1 + sherpa-onnx/csrc/jieba-lexicon.cc | 216 ++++++++++++++++++ sherpa-onnx/csrc/jieba-lexicon.h | 47 ++++ sherpa-onnx/csrc/lexicon.cc | 4 +- sherpa-onnx/csrc/offline-tts-vits-impl.h | 18 ++ .../csrc/offline-tts-vits-model-config.cc | 22 +- .../csrc/offline-tts-vits-model-config.h | 5 + .../csrc/offline-tts-vits-model-metadata.h | 4 + sherpa-onnx/csrc/offline-tts-vits-model.cc | 1 + .../csrc/offline-tts-vits-model-config.cc | 10 +- 14 files changed, 513 insertions(+), 8 deletions(-) create mode 100644 cmake/cppjieba.cmake create mode 100644 sherpa-onnx/csrc/cppjieba-test.cc create mode 100644 sherpa-onnx/csrc/jieba-lexicon.cc create mode 100644 sherpa-onnx/csrc/jieba-lexicon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 332f3b087..f2d2c72b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -260,6 +260,7 @@ if(SHERPA_ONNX_ENABLE_TTS) set(ESPEAK_NG_DIR ${espeak_ng_SOURCE_DIR}) message(STATUS "ESPEAK_NG_DIR: ${ESPEAK_NG_DIR}") include(piper-phonemize) + include(cppjieba) # For Chinese TTS. 
It is a header-only C++ library endif() add_subdirectory(sherpa-onnx) diff --git a/cmake/cppjieba.cmake b/cmake/cppjieba.cmake new file mode 100644 index 000000000..9ad27d7b5 --- /dev/null +++ b/cmake/cppjieba.cmake @@ -0,0 +1,45 @@ +function(download_cppjieba) + include(FetchContent) + + set(cppjieba_URL "https://github.com/csukuangfj/cppjieba/archive/refs/tags/sherpa-onnx-2024-04-19.tar.gz") + set(cppjieba_URL2 "https://hub.nuaa.cf/csukuangfj/cppjieba/archive/refs/tags/sherpa-onnx-2024-04-19.tar.gz") + set(cppjieba_HASH "SHA256=03e5264687f0efaef05487a07d49c3f4c0f743347bfbf825df4b30cc75ac5288") + + # If you don't have access to the Internet, + # please pre-download cppjieba + set(possible_file_locations + $ENV{HOME}/Downloads/cppjieba-sherpa-onnx-2024-04-19.tar.gz + ${CMAKE_SOURCE_DIR}/cppjieba-sherpa-onnx-2024-04-19.tar.gz + ${CMAKE_BINARY_DIR}/cppjieba-sherpa-onnx-2024-04-19.tar.gz + /tmp/cppjieba-sherpa-onnx-2024-04-19.tar.gz + /star-fj/fangjun/download/github/cppjieba-sherpa-onnx-2024-04-19.tar.gz + ) + + foreach(f IN LISTS possible_file_locations) + if(EXISTS ${f}) + set(cppjieba_URL "${f}") + file(TO_CMAKE_PATH "${cppjieba_URL}" cppjieba_URL) + message(STATUS "Found local downloaded cppjieba: ${cppjieba_URL}") + set(cppjieba_URL2) + break() + endif() + endforeach() + + FetchContent_Declare(cppjieba + URL + ${cppjieba_URL} + ${cppjieba_URL2} + URL_HASH + ${cppjieba_HASH} + ) + + FetchContent_GetProperties(cppjieba) + if(NOT cppjieba_POPULATED) + message(STATUS "Downloading cppjieba ${cppjieba_URL}") + FetchContent_Populate(cppjieba) + endif() + message(STATUS "cppjieba is downloaded to ${cppjieba_SOURCE_DIR}") + add_subdirectory(${cppjieba_SOURCE_DIR} ${cppjieba_BINARY_DIR} EXCLUDE_FROM_ALL) +endfunction() + +download_cppjieba() diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index 377512b1f..8c9fdbb76 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -132,6 +132,7 @@ list(APPEND sources 
if(SHERPA_ONNX_ENABLE_TTS) list(APPEND sources + jieba-lexicon.cc lexicon.cc offline-tts-character-frontend.cc offline-tts-impl.cc @@ -184,6 +185,7 @@ endif() if(SHERPA_ONNX_ENABLE_TTS) target_link_libraries(sherpa-onnx-core piper_phonemize) target_link_libraries(sherpa-onnx-core fstfar fst) + target_link_libraries(sherpa-onnx-core cppjieba) endif() if(SHERPA_ONNX_ENABLE_CHECK) @@ -491,6 +493,7 @@ if(SHERPA_ONNX_ENABLE_TESTS) ) if(SHERPA_ONNX_ENABLE_TTS) list(APPEND sherpa_onnx_test_srcs + cppjieba-test.cc piper-phonemize-test.cc ) endif() diff --git a/sherpa-onnx/csrc/cppjieba-test.cc b/sherpa-onnx/csrc/cppjieba-test.cc new file mode 100644 index 000000000..77a856e2e --- /dev/null +++ b/sherpa-onnx/csrc/cppjieba-test.cc @@ -0,0 +1,144 @@ +// sherpa-onnx/csrc/cppjieba-test.cc +// +// Copyright (c) 2024 Xiaomi Corporation +#include +#include // NOLINT +#include +#include + +#include "cppjieba/Jieba.hpp" +#include "gtest/gtest.h" +#include "sherpa-onnx/csrc/file-utils.h" +#include "sherpa-onnx/csrc/macros.h" + +namespace sherpa_onnx { + +// Please download dict files form +// https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2 +const char *const kDictPath = "./dict/jieba.dict.utf8"; +const char *const kHmmPath = "./dict/hmm_model.utf8"; +const char *const kUserDictPath = "./dict/user.dict.utf8"; +const char *const kIdfPath = "./dict/idf.utf8"; +const char *const kStopWordPath = "./dict/stop_words.utf8"; + +TEST(CppJieBa, Case1) { + if (!FileExists(kDictPath)) { + SHERPA_ONNX_LOGE("%s does not exist. 
Skipping test", kDictPath); + return; + } + + cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath, + kStopWordPath); + + std::vector words; + std::vector jiebawords; + + std::string s = "他来到了网易杭研大厦"; + std::cout << s << std::endl; + std::cout << "[demo] Cut With HMM" << std::endl; + jieba.Cut(s, words, true); + std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; + /* + 他来到了网易杭研大厦 + [demo] Cut With HMM + 他/来到/了/网易/杭研/大厦 + */ + s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; + std::cout << s << std::endl; + std::cout << "[demo] CutForSearch" << std::endl; + jieba.CutForSearch(s, words); + std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; + /* + 小明硕士毕业于中国科学院计算所,后在日本京都大学深造 + [demo] CutForSearch + 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造 + */ + std::cout << "[demo] Insert User Word" << std::endl; + jieba.Cut("男默女泪", words); + std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; + jieba.InsertUserWord("男默女泪"); + jieba.Cut("男默女泪", words); + std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; + /* + [demo] Insert User Word + 男默/女泪 + 男默女泪 + */ + std::cout << "[demo] CutForSearch Word With Offset" << std::endl; + jieba.CutForSearch(s, jiebawords, true); + std::cout << jiebawords << std::endl; + /* +[demo] CutForSearch Word With Offset +[{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", +"offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, +{"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": +"科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", +"offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, +{"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", +"offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, +{"word": "日本京都大学", "offset": 54}, {"word": " 深造", "offset": 72}] + */ + // see more test at + // 
https://github.com/yanyiwu/cppjieba/blob/master/test/demo.cpp +} + +TEST(CppJieBa, Case2) { + if (!FileExists(kDictPath)) { + SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath); + return; + } + + cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath, + kStopWordPath); + std::string s = + "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如" + "涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感" + "受着生命的奇迹与温柔"; + std::vector words; + bool is_hmm = true; + jieba.Cut(s, words, is_hmm); + { + std::ostringstream os; + std::string sep = ""; + for (const auto &w : words) { + os << sep << w; + sep = "_"; + } + + std::cout << os.str() << "\n"; + } + /* +当_夜幕降临_,_星光点点_,_伴随_着_微风_拂面_, +_我_在_静谧_中_感受_着_时光_的_流转_, +_思念_如_涟漪_荡漾_,_梦境_如_画卷_展开_,_我_与_自然_融为一体_, +_沉静_在_这_片_宁静_的_美丽_之中_,_感受_着_生命_的_奇迹_与_温柔 + */ + s = "这里有:红的、绿的、蓝的;各种各样的颜色都有!你想要什么呢?测试."; + std::regex punct_re(":|、|;"); + std::string s2 = std::regex_replace(s, punct_re, ","); + + std::regex punct_re2("[.]"); + s2 = std::regex_replace(s2, punct_re2, "。"); + + std::regex punct_re3("[?]"); + s2 = std::regex_replace(s2, punct_re3, "?"); + + std::regex punct_re4("[!]"); + s2 = std::regex_replace(s2, punct_re4, "!"); + std::cout << s << "\n" << s2 << "\n"; + + words.clear(); + jieba.Cut(s2, words, is_hmm); + { + std::ostringstream os; + std::string sep = ""; + for (const auto &w : words) { + os << sep << w; + sep = "_"; + } + + std::cout << os.str() << "\n"; + } +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/file-utils.cc b/sherpa-onnx/csrc/file-utils.cc index 8a64decff..f5cf48f97 100644 --- a/sherpa-onnx/csrc/file-utils.cc +++ b/sherpa-onnx/csrc/file-utils.cc @@ -18,6 +18,7 @@ bool FileExists(const std::string &filename) { void AssertFileExists(const std::string &filename) { if (!FileExists(filename)) { SHERPA_ONNX_LOG(FATAL) << filename << " does not exist!"; + exit(-1); } } diff --git a/sherpa-onnx/csrc/jieba-lexicon.cc b/sherpa-onnx/csrc/jieba-lexicon.cc new file mode 100644 index 000000000..c63fff1c0 --- /dev/null 
+++ b/sherpa-onnx/csrc/jieba-lexicon.cc @@ -0,0 +1,216 @@ +// sherpa-onnx/csrc/jieba-lexicon.cc +// +// Copyright (c) 2022-2024 Xiaomi Corporation + +#include "sherpa-onnx/csrc/jieba-lexicon.h" + +#include +#include // NOLINT +#include + +#include "cppjieba/Jieba.hpp" +#include "sherpa-onnx/csrc/file-utils.h" +#include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/text-utils.h" + +namespace sherpa_onnx { + +// implemented in ./lexicon.cc +std::unordered_map ReadTokens(std::istream &is); +std::vector ConvertTokensToIds( + const std::unordered_map &token2id, + const std::vector &tokens); + +class JiebaLexicon::Impl { + public: + Impl(const std::string &lexicon, const std::string &tokens, + const std::string &dict_dir, + const OfflineTtsVitsModelMetaData &meta_data, bool debug) + : meta_data_(meta_data), debug_(debug) { + std::string dict = dict_dir + "/jieba.dict.utf8"; + std::string hmm = dict_dir + "/hmm_model.utf8"; + std::string user_dict = dict_dir + "/user.dict.utf8"; + std::string idf = dict_dir + "/idf.utf8"; + std::string stop_word = dict_dir + "/stop_words.utf8"; + + AssertFileExists(dict); + AssertFileExists(hmm); + AssertFileExists(user_dict); + AssertFileExists(idf); + AssertFileExists(stop_word); + + jieba_ = + std::make_unique(dict, hmm, user_dict, idf, stop_word); + + { + std::ifstream is(tokens); + InitTokens(is); + } + + { + std::ifstream is(lexicon); + InitLexicon(is); + } + } + + std::vector> ConvertTextToTokenIds( + const std::string &text) const { + // see + // https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/text/mandarin.py#L244 + std::regex punct_re{":|、|;"}; + std::string s = std::regex_replace(text, punct_re, ","); + + std::regex punct_re2("[.]"); + s = std::regex_replace(s, punct_re2, "。"); + + std::regex punct_re3("[?]"); + s = std::regex_replace(s, punct_re3, "?"); + + std::regex punct_re4("[!]"); + s = std::regex_replace(s, punct_re4, "!"); + + std::vector words; + bool is_hmm = true; + jieba_->Cut(s, words, 
is_hmm); + + if (debug_) { + SHERPA_ONNX_LOGE("input text: %s", text.c_str()); + SHERPA_ONNX_LOGE("after replacing punctuations: %s", s.c_str()); + + std::ostringstream os; + std::string sep = ""; + for (const auto &w : words) { + os << sep << w; + sep = "_"; + } + + SHERPA_ONNX_LOGE("after jieba processing: %s", os.str().c_str()); + } + + std::vector> ans; + std::vector this_sentence; + + int32_t blank = token2id_.at(" "); + for (const auto &w : words) { + auto ids = ConvertWordToIds(w); + if (ids.empty()) { + SHERPA_ONNX_LOGE("Ignore OOV '%s'", w.c_str()); + continue; + } + + this_sentence.insert(this_sentence.end(), ids.begin(), ids.end()); + this_sentence.push_back(blank); + + if (w == "。" || w == "!" || w == "?" || w == ",") { + ans.push_back(std::move(this_sentence)); + } + } // for (const auto &w : words) + + if (!this_sentence.empty()) { + ans.push_back(std::move(this_sentence)); + } + + return ans; + } + + private: + std::vector ConvertWordToIds(const std::string &w) const { + if (word2ids_.count(w)) { + return word2ids_.at(w); + } + + if (token2id_.count(w)) { + return {token2id_.at(w)}; + } + + std::vector ans; + + std::vector words = SplitUtf8(w); + for (const auto &word : words) { + if (word2ids_.count(word)) { + auto ids = ConvertWordToIds(word); + ans.insert(ans.end(), ids.begin(), ids.end()); + } + } + + return ans; + } + + void InitTokens(std::istream &is) { + token2id_ = ReadTokens(is); + + std::vector> puncts = { + {",", ","}, {".", "。"}, {"!", "!"}, {"?", "?"}}; + + for (const auto &p : puncts) { + if (token2id_.count(p.first) && !token2id_.count(p.second)) { + token2id_[p.second] = token2id_[p.first]; + } + } + } + + void InitLexicon(std::istream &is) { + std::string word; + std::vector token_list; + std::string line; + std::string phone; + int32_t line_num = 0; + + while (std::getline(is, line)) { + ++line_num; + + std::istringstream iss(line); + + token_list.clear(); + + iss >> word; + ToLowerCase(&word); + + if (word2ids_.count(word)) { + 
SHERPA_ONNX_LOGE("Duplicated word: %s at line %d:%s. Ignore it.", + word.c_str(), line_num, line.c_str()); + continue; + } + + while (iss >> phone) { + token_list.push_back(std::move(phone)); + } + + std::vector ids = ConvertTokensToIds(token2id_, token_list); + if (ids.empty()) { + continue; + } + + word2ids_.insert({std::move(word), std::move(ids)}); + } + } + + private: + // lexicon.txt is saved in word2ids_ + std::unordered_map> word2ids_; + + // tokens.txt is saved in token2id_ + std::unordered_map token2id_; + + OfflineTtsVitsModelMetaData meta_data_; + + std::unique_ptr jieba_; + bool debug_ = false; +}; + +JiebaLexicon::~JiebaLexicon() = default; + +JiebaLexicon::JiebaLexicon(const std::string &lexicon, + const std::string &tokens, + const std::string &dict_dir, + const OfflineTtsVitsModelMetaData &meta_data, + bool debug) + : impl_(std::make_unique(lexicon, tokens, dict_dir, meta_data, + debug)) {} + +std::vector> JiebaLexicon::ConvertTextToTokenIds( + const std::string &text, const std::string &unused_voice /*= ""*/) const { + return impl_->ConvertTextToTokenIds(text); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/jieba-lexicon.h b/sherpa-onnx/csrc/jieba-lexicon.h new file mode 100644 index 000000000..867fa01aa --- /dev/null +++ b/sherpa-onnx/csrc/jieba-lexicon.h @@ -0,0 +1,47 @@ +// sherpa-onnx/csrc/jieba-lexicon.h +// +// Copyright (c) 2022-2024 Xiaomi Corporation + +#ifndef SHERPA_ONNX_CSRC_JIEBA_LEXICON_H_ +#define SHERPA_ONNX_CSRC_JIEBA_LEXICON_H_ + +#include +#include +#include +#include + +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#include "sherpa-onnx/csrc/offline-tts-frontend.h" +#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h" + +namespace sherpa_onnx { + +class JiebaLexicon : public OfflineTtsFrontend { + public: + ~JiebaLexicon() override; + JiebaLexicon(const std::string &lexicon, const std::string &tokens, + const std::string &dict_dir, + 
const OfflineTtsVitsModelMetaData &meta_data, bool debug); + +#if __ANDROID_API__ >= 9 + JiebaLexicon(AAssetManager *mgr, const std::string &lexicon, + const std::string &tokens, const std::string &dict_dir, + const OfflineTtsVitsModelMetaData &meta_data); +#endif + + std::vector> ConvertTextToTokenIds( + const std::string &text, + const std::string &unused_voice = "") const override; + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_JIEBA_LEXICON_H_ diff --git a/sherpa-onnx/csrc/lexicon.cc b/sherpa-onnx/csrc/lexicon.cc index 2f176ddea..3a502c24c 100644 --- a/sherpa-onnx/csrc/lexicon.cc +++ b/sherpa-onnx/csrc/lexicon.cc @@ -76,7 +76,7 @@ static std::vector ProcessHeteronyms( // Note: We don't use SymbolTable here since tokens may contain a blank // in the first column -static std::unordered_map ReadTokens(std::istream &is) { +std::unordered_map ReadTokens(std::istream &is) { std::unordered_map token2id; std::string line; @@ -113,7 +113,7 @@ static std::unordered_map ReadTokens(std::istream &is) { return token2id; } -static std::vector ConvertTokensToIds( +std::vector ConvertTokensToIds( const std::unordered_map &token2id, const std::vector &tokens) { std::vector ids; diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index a873fd8f6..8b0447209 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -19,6 +19,7 @@ #include "fst/extensions/far/far.h" #include "kaldifst/csrc/kaldi-fst-io.h" #include "kaldifst/csrc/text-normalizer.h" +#include "sherpa-onnx/csrc/jieba-lexicon.h" #include "sherpa-onnx/csrc/lexicon.h" #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/offline-tts-character-frontend.h" @@ -290,9 +291,26 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { void InitFrontend() { const auto &meta_data = model_->GetMetaData(); + if (meta_data.jieba && config_.model.vits.dict_dir.empty()) { 
+ SHERPA_ONNX_LOGE( + "Please provide --vits-dict-dir for Chinese TTS models using jieba"); + exit(-1); + } + + if (!meta_data.jieba && !config_.model.vits.dict_dir.empty()) { + SHERPA_ONNX_LOGE( + "Current model is not using jieba but you provided --vits-dict-dir"); + exit(-1); + } + if (meta_data.frontend == "characters") { frontend_ = std::make_unique( config_.model.vits.tokens, meta_data); + } else if (meta_data.jieba && !config_.model.vits.dict_dir.empty()) { + frontend_ = std::make_unique( + config_.model.vits.lexicon, config_.model.vits.tokens, + config_.model.vits.dict_dir, model_->GetMetaData(), + config_.model.debug); } else if ((meta_data.is_piper || meta_data.is_coqui || meta_data.is_icefall) && !config_.model.vits.data_dir.empty()) { diff --git a/sherpa-onnx/csrc/offline-tts-vits-model-config.cc b/sherpa-onnx/csrc/offline-tts-vits-model-config.cc index 3d35726fe..e6195b4f9 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model-config.cc +++ b/sherpa-onnx/csrc/offline-tts-vits-model-config.cc @@ -4,6 +4,8 @@ #include "sherpa-onnx/csrc/offline-tts-vits-model-config.h" +#include + #include "sherpa-onnx/csrc/file-utils.h" #include "sherpa-onnx/csrc/macros.h" @@ -16,6 +18,9 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) { po->Register("vits-data-dir", &data_dir, "Path to the directory containing dict for espeak-ng. If it is " "given, --vits-lexicon is ignored."); + po->Register("vits-dict-dir", &dict_dir, + "Path to the directory containing dict for jieba. Used only for " + "Chinese TTS models using jieba"); po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models"); po->Register("vits-noise-scale-w", &noise_scale_w, "noise_scale_w for VITS models"); @@ -64,12 +69,24 @@ bool OfflineTtsVitsModelConfig::Validate() const { } if (!FileExists(data_dir + "/intonations")) { - SHERPA_ONNX_LOGE("%s/intonations does not exist. 
Skipping test", - data_dir.c_str()); + SHERPA_ONNX_LOGE("%s/intonations does not exist.", data_dir.c_str()); return false; } } + if (!dict_dir.empty()) { + std::vector required_files = { + "jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8", + "idf.utf8", "stop_words.utf8", + }; + + for (const auto &f : required_files) { + if (!FileExists(dict_dir + "/" + f)) { + SHERPA_ONNX_LOGE("%s/%s does not exist.", dict_dir.c_str(), f.c_str()); + return false; + } + } + } return true; } @@ -81,6 +98,7 @@ std::string OfflineTtsVitsModelConfig::ToString() const { os << "lexicon=\"" << lexicon << "\", "; os << "tokens=\"" << tokens << "\", "; os << "data_dir=\"" << data_dir << "\", "; + os << "dict_dir=\"" << dict_dir << "\", "; os << "noise_scale=" << noise_scale << ", "; os << "noise_scale_w=" << noise_scale_w << ", "; os << "length_scale=" << length_scale << ")"; diff --git a/sherpa-onnx/csrc/offline-tts-vits-model-config.h b/sherpa-onnx/csrc/offline-tts-vits-model-config.h index cde8b3920..09f00b8b2 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model-config.h +++ b/sherpa-onnx/csrc/offline-tts-vits-model-config.h @@ -20,6 +20,9 @@ struct OfflineTtsVitsModelConfig { // data_dir is for piper-phonemize, which uses espeak-ng std::string data_dir; + // Used for Chinese TTS models using jieba + std::string dict_dir; + float noise_scale = 0.667; float noise_scale_w = 0.8; float length_scale = 1; @@ -33,12 +36,14 @@ struct OfflineTtsVitsModelConfig { const std::string &lexicon, const std::string &tokens, const std::string &data_dir, + const std::string &dict_dir, float noise_scale = 0.667, float noise_scale_w = 0.8, float length_scale = 1) : model(model), lexicon(lexicon), tokens(tokens), data_dir(data_dir), + dict_dir(dict_dir), noise_scale(noise_scale), noise_scale_w(noise_scale_w), length_scale(length_scale) {} diff --git a/sherpa-onnx/csrc/offline-tts-vits-model-metadata.h b/sherpa-onnx/csrc/offline-tts-vits-model-metadata.h index e4e9d8864..621e0e555 100644 --- 
a/sherpa-onnx/csrc/offline-tts-vits-model-metadata.h +++ b/sherpa-onnx/csrc/offline-tts-vits-model-metadata.h @@ -22,6 +22,10 @@ struct OfflineTtsVitsModelMetaData { bool is_coqui = false; bool is_icefall = false; + // for Chinese TTS models from + // https://github.com/Plachtaa/VITS-fast-fine-tuning + int32_t jieba = 0; + // the following options are for models from coqui-ai/TTS int32_t blank_id = 0; int32_t bos_id = 0; diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.cc b/sherpa-onnx/csrc/offline-tts-vits-model.cc index c55e72f5c..d73a453c2 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model.cc +++ b/sherpa-onnx/csrc/offline-tts-vits-model.cc @@ -93,6 +93,7 @@ class OfflineTtsVitsModel::Impl { SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(meta_data_.frontend, "frontend", ""); + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.jieba, "jieba", 0); SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.blank_id, "blank_id", 0); SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.bos_id, "bos_id", 0); SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.eos_id, "eos_id", 0); diff --git a/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc b/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc index c88c92e0b..c8c60e9c8 100644 --- a/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc +++ b/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc @@ -16,15 +16,17 @@ void PybindOfflineTtsVitsModelConfig(py::module *m) { py::class_(*m, "OfflineTtsVitsModelConfig") .def(py::init<>()) .def(py::init(), + const std::string &, const std::string &, + const std::string &, float, float, float>(), py::arg("model"), py::arg("lexicon"), py::arg("tokens"), - py::arg("data_dir") = "", py::arg("noise_scale") = 0.667, - py::arg("noise_scale_w") = 0.8, py::arg("length_scale") = 1.0) + py::arg("data_dir") = "", py::arg("dict_dir") = "", + py::arg("noise_scale") = 0.667, py::arg("noise_scale_w") = 0.8, + py::arg("length_scale") = 1.0) .def_readwrite("model", &PyClass::model) 
.def_readwrite("lexicon", &PyClass::lexicon) .def_readwrite("tokens", &PyClass::tokens) .def_readwrite("data_dir", &PyClass::data_dir) + .def_readwrite("dict_dir", &PyClass::dict_dir) .def_readwrite("noise_scale", &PyClass::noise_scale) .def_readwrite("noise_scale_w", &PyClass::noise_scale_w) .def_readwrite("length_scale", &PyClass::length_scale)