Revert "change g2p and other fix" (#40)
Stardust-minus authored Sep 30, 2023
1 parent c3df0ba commit 2f687e4
Showing 11 changed files with 253 additions and 75 deletions.
53 changes: 53 additions & 0 deletions bert/bert-base-japanese-v3/README.md
@@ -0,0 +1,53 @@
---
license: apache-2.0
datasets:
- cc100
- wikipedia
language:
- ja
widget:
- text: 東北大学で[MASK]の研究をしています。
---

# BERT base Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)

This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.

This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in the [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by WordPiece subword tokenization.
Additionally, the model is trained with whole word masking enabled for the masked language modeling (MLM) objective.

The code for pretraining is available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
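
As a quick illustration of the model described above, here is a minimal fill-mask sketch. It assumes the `transformers`, `fugashi`, and `unidic-lite` packages are installed and that the model files sit under `bert/bert-base-japanese-v3`, as they do in this repository; the path and the printed fields are illustrative rather than part of the original model card.

```python
# Minimal fill-mask sketch (assumes transformers, fugashi and unidic-lite are installed;
# the local path matches where this repository stores the model files).
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline

model_path = "./bert/bert-base-japanese-v3"
tokenizer = AutoTokenizer.from_pretrained(model_path)  # MeCab word split + WordPiece subwords
model = AutoModelForMaskedLM.from_pretrained(model_path)

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
for candidate in fill_mask("東北大学で[MASK]の研究をしています。", top_k=3):
    print(candidate["token_str"], candidate["score"])
```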

## Model architecture

The model architecture is the same as the original BERT base model: 12 layers, 768-dimensional hidden states, and 12 attention heads.
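
These values can also be read back from the model's `config.json`; a small sketch, assuming the same local path as in the example above:

```python
from transformers import AutoConfig

# Reads the bundled config.json and reports the architecture hyperparameters.
config = AutoConfig.from_pretrained("./bert/bert-base-japanese-v3")
print(config.num_hidden_layers, config.hidden_size, config.num_attention_heads)  # 12 768 12
```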

## Training Data

The model is trained on the Japanese portion of the [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.

To split texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with the [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).

## Tokenization

The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
The vocabulary size is 32768.

We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
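
A small sketch of what this two-stage tokenization looks like in practice, assuming the same `transformers` + `fugashi` + `unidic-lite` setup as above (the input sentence is only an example):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
print(tokenizer.vocab_size)  # 32768

# MeCab (Unidic 2.1.2 via unidic-lite) first splits the text into words;
# WordPiece then splits each word into subwords from the 32768-entry vocabulary.
print(tokenizer.tokenize("東北大学で自然言語処理の研究をしています。"))
```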

## Training

We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
For the MLM (masked language modeling) objective, we introduced whole word masking, in which all of the subword tokens corresponding to a single word (as tokenized by MeCab) are masked at once.
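
The snippet below is a conceptual sketch of whole word masking, not the actual pretraining code from cl-tohoku/bert-japanese: every WordPiece piece belonging to one MeCab-level word is kept or masked as a unit.

```python
import random

def whole_word_mask(words, tokenizer, mask_prob=0.15):
    """Illustrative only: mask all subword pieces of a MeCab word together."""
    pieces, is_masked = [], []
    for word in words:
        subwords = tokenizer.tokenize(word)       # WordPiece pieces of one word
        mask_this_word = random.random() < mask_prob
        for piece in subwords:
            pieces.append("[MASK]" if mask_this_word else piece)
            is_masked.append(mask_this_word)      # positions the MLM loss is computed on
    return pieces, is_masked
```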

For training each model, we used a v3-8 Cloud TPU instance provided by [TPU Research Cloud](https://sites.research.google/trc/about/).

## Licenses

The pretrained models are distributed under the Apache License 2.0.

## Acknowledgments

This model was trained with Cloud TPUs provided by the [TPU Research Cloud](https://sites.research.google/trc/about/) program.
@@ -5,14 +5,14 @@
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 4096,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"type_vocab_size": 2,
"vocab_size": 32768
File renamed without changes.
10 changes: 0 additions & 10 deletions bert/bert-large-japanese-v2/tokenizer_config.json

This file was deleted.

14 changes: 4 additions & 10 deletions data_utils.py
@@ -154,13 +154,13 @@ def get_text(self, text, word2ph, phone, tone, language_str, wav_path):

         if language_str == "ZH":
             bert = bert
-            ja_bert = torch.zeros(1024, len(phone))
+            ja_bert = torch.zeros(768, len(phone))
         elif language_str == "JA":
             ja_bert = bert
             bert = torch.zeros(1024, len(phone))
         else:
             bert = torch.zeros(1024, len(phone))
-            ja_bert = torch.zeros(1024, len(phone))
+            ja_bert = torch.zeros(768, len(phone))
         assert bert.shape[-1] == len(phone), (
             bert.shape,
             len(phone),
@@ -208,13 +208,7 @@ def __call__(self, batch):
             torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True
         )
 
-        max_text_len = max(
-            [
-                batch[ids_sorted_decreasing[i]][7].size(1)
-                for i in range(len(ids_sorted_decreasing))
-            ]
-            + [len(x[0]) for x in batch]
-        )
+        max_text_len = max([len(x[0]) for x in batch])
         max_spec_len = max([x[1].size(1) for x in batch])
         max_wav_len = max([x[2].size(1) for x in batch])

@@ -227,7 +221,7 @@ def __call__(self, batch):
         tone_padded = torch.LongTensor(len(batch), max_text_len)
         language_padded = torch.LongTensor(len(batch), max_text_len)
         bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
-        ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
+        ja_bert_padded = torch.FloatTensor(len(batch), 768, max_text_len)
 
         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
2 changes: 1 addition & 1 deletion models.py
@@ -340,7 +340,7 @@ def __init__(
         self.language_emb = nn.Embedding(num_languages, hidden_channels)
         nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5)
         self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
-        self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
+        self.ja_bert_proj = nn.Conv1d(768, hidden_channels, 1)
 
         self.encoder = attentions.Encoder(
             hidden_channels,
2 changes: 1 addition & 1 deletion requirements.txt
@@ -15,7 +15,7 @@ pypinyin
 cn2an
 gradio
 av
-pyopenjtalk
+mecab-python3
 loguru
 unidic-lite
 cmudict
7 changes: 1 addition & 6 deletions text/__init__.py
@@ -11,12 +11,7 @@ def cleaned_text_to_sequence(cleaned_text, tones, language):
     Returns:
       List of integers corresponding to the symbols in the text
     """
-    phones = []  # _symbol_to_id[symbol] for symbol in cleaned_text
-    for symbol in cleaned_text:
-        try:
-            phones.append(_symbol_to_id[symbol])
-        except KeyError:
-            phones.append(0)  # symbol not found in ID map, use 0('_') by default
+    phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
     tone_start = language_tone_start_map[language]
     tones = [i + tone_start for i in tones]
     lang_id = language_id_map[language]