From 981c34b6e592f39e442174abcd9665a817b9c2fe Mon Sep 17 00:00:00 2001 From: Paarth Neekhara Date: Fri, 1 Dec 2023 14:04:32 -0500 Subject: [PATCH] use phoneme tokenizer for edit speech Signed-off-by: Paarth Neekhara --- .../data/language_modeling/megatron/t5_speechlm_dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/t5_speechlm_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/t5_speechlm_dataset.py index 70cbc4670078..b3d2fd2969d8 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/t5_speechlm_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/t5_speechlm_dataset.py @@ -647,6 +647,11 @@ def _get_tokens(self, doc, field, field_data): instruction_tokens = self._get_text_tokens("Phoneme TTS") field_tokens = self._get_phoneme_tokens(_text.replace("Phoneme TTS ", "")) field_tokens = instruction_tokens + field_tokens + elif _text.startswith("Edit Speech"): + # Always use phoneme tokenizer for edit speech + instruction_tokens = self._get_text_tokens("Edit Speech") + field_tokens = self._get_phoneme_tokens(_text.replace("Edit Speech ", "")) + field_tokens = instruction_tokens + field_tokens else: field_tokens = self._get_text_tokens(field_data.strip(" ")) # list of ids elif doc[f"{field}_type"] == 'SPEECH':