Skip to content

Commit

Permalink
[text] fix bpe model in multiprocess env
Browse files Browse the repository at this point in the history
  • Loading branch information
Mddct committed Nov 27, 2023
1 parent 301af9e commit aab5fde
Showing 1 changed file with 12 additions and 3 deletions.
15 changes: 12 additions & 3 deletions wenet/text/bpe_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,19 @@ def __init__(
) -> None:
super().__init__(symbol_table, non_lang_syms, split_with_space,
connect_symbol, unk)
import sentencepiece as spm
self.bpe_model = spm.SentencePieceProcessor()
self.bpe_model.load(bpe_model)
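# NOTE(Mddct): defer loading the SentencePiece model until first use; loading it in __init__ causes issues when the tokenizer is used with multiprocessing.Process()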
# see: https://github.com/espnet/espnet/blob/master/espnet2/text/sentencepiece_tokenizer.py#L19
self._model = bpe_model
self.bpe_model = None

def _build_sp(self):
    """Load the SentencePiece model on first use.

    Does nothing when ``self.bpe_model`` is already set. Otherwise a
    ``SentencePieceProcessor`` is created, loaded from the path stored in
    ``self._model`` and cached on ``self.bpe_model``.

    Any exception raised by ``load`` propagates to the caller; in that case
    ``self.bpe_model`` is left as ``None`` so a later call tries again.
    """
    if self.bpe_model is not None:
        return
    # Kept as a function-scope import, as in the original, so the package is
    # only required once a model actually has to be built.
    import sentencepiece as spm
    processor = spm.SentencePieceProcessor()
    processor.load(self._model)
    # Publish only after load() has returned: if it raises, bpe_model stays
    # None instead of caching a processor that never loaded a model.
    self.bpe_model = processor

def text2tokens(self, line: str) -> List[str]:
self._build_sp()
line = line.strip()
if self.non_lang_syms_pattern is not None:
parts = self.non_lang_syms_pattern.split(line.upper())
Expand All @@ -38,5 +46,6 @@ def text2tokens(self, line: str) -> List[str]:
return tokens

def tokens2text(self, tokens: List[str]) -> str:
    """Convert a list of tokens back into a plain text string.

    Calls ``self._build_sp()`` first, then lets the parent class join the
    tokens. Every "▁" character in the joined string is replaced by a space,
    and leading/trailing whitespace is stripped from the result.
    """
    self._build_sp()
    joined = super().tokens2text(tokens)
    spaced = joined.replace("▁", ' ')
    return spaced.strip()

0 comments on commit aab5fde

Please sign in to comment.