[TTS] Add dataset to path of logged artifacts (NVIDIA#7462)

* [TTS] Add dataset to path of logged artifacts

  Signed-off-by: Ryan <[email protected]>

* [TTS] Revert axis name back to Audio Frames

  Signed-off-by: Ryan <[email protected]>

---------

Signed-off-by: Ryan <[email protected]>
Signed-off-by: Sasha Meister <[email protected]>
ssh-meister · Oct 5, 2023 · d22b4d1
1 parent 1bd4bd0
commit d22b4d1
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 61 deletions.
diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py
@@ -46,6 +46,7 @@ class DatasetMeta:
 
 @dataclass
 class DatasetSample:
+    dataset_name: str
     manifest_entry: Dict[str, Any]
     audio_dir: Path
     feature_dir: Path
@@ -180,6 +181,7 @@ def _preprocess_manifest(
                 speaker_index = 0
 
             sample = DatasetSample(
+                dataset_name=dataset_name,
                 manifest_entry=entry,
                 audio_dir=Path(dataset.audio_dir),
                 feature_dir=Path(dataset.feature_dir),
@@ -204,7 +206,12 @@ def __getitem__(self, index):
         audio, _ = librosa.load(audio_filepath_abs, sr=self.sample_rate)
         tokens = self.text_tokenizer(data.text)
 
-        example = {"audio_filepath": audio_filepath_rel, "audio": audio, "tokens": tokens}
+        example = {
+            "dataset_name": data.dataset_name,
+            "audio_filepath": audio_filepath_rel,
+            "audio": audio,
+            "tokens": tokens,
+        }
 
         if data.speaker is not None:
             example["speaker"] = data.speaker
@@ -229,6 +236,7 @@ def __getitem__(self, index):
         return example
 
     def collate_fn(self, batch: List[dict]):
+        dataset_name_list = []
         audio_filepath_list = []
         audio_list = []
         audio_len_list = []
@@ -238,6 +246,7 @@ def collate_fn(self, batch: List[dict]):
         prior_list = []
 
         for example in batch:
+            dataset_name_list.append(example["dataset_name"])
             audio_filepath_list.append(example["audio_filepath"])
 
             audio_tensor = torch.tensor(example["audio"], dtype=torch.float32)
@@ -264,6 +273,7 @@ def collate_fn(self, batch: List[dict]):
         batch_tokens = stack_tensors(token_list, max_lens=[token_max_len], pad_value=self.text_tokenizer.pad)
 
         batch_dict = {
+            "dataset_names": dataset_name_list,
             "audio_filepaths": audio_filepath_list,
             "audio": batch_audio,
             "audio_lens": batch_audio_len,

diff --git a/nemo/collections/tts/data/vocoder_dataset.py b/nemo/collections/tts/data/vocoder_dataset.py
@@ -43,6 +43,7 @@ class DatasetMeta:
 
 @dataclass
 class DatasetSample:
+    dataset_name: str
     manifest_entry: dict
     audio_dir: Path
 
@@ -165,7 +166,7 @@ def _preprocess_manifest(
         samples = []
         sample_weights = []
         for entry in filtered_entries:
-            sample = DatasetSample(manifest_entry=entry, audio_dir=Path(dataset.audio_dir),)
+            sample = DatasetSample(dataset_name=dataset_name, manifest_entry=entry, audio_dir=Path(dataset.audio_dir))
             samples.append(sample)
             sample_weights.append(dataset.sample_weight)
 
@@ -182,19 +183,26 @@ def __getitem__(self, index):
 
         audio, audio_len = self._sample_audio(audio_filepath_abs)
 
-        example = {"audio_filepath": audio_filepath_rel, "audio": audio, "audio_len": audio_len}
+        example = {
+            "dataset_name": data.dataset_name,
+            "audio_filepath": audio_filepath_rel,
+            "audio": audio,
+            "audio_len": audio_len,
+        }
 
         for processor in self.feature_processors:
             processor.process(example)
 
         return example
 
     def collate_fn(self, batch: List[dict]):
+        dataset_name_list = []
         audio_filepath_list = []
         audio_list = []
         audio_len_list = []
 
         for example in batch:
+            dataset_name_list.append(example["dataset_name"])
             audio_filepath_list.append(example["audio_filepath"])
             audio_list.append(example["audio"])
             audio_len_list.append(example["audio_len"])
@@ -205,6 +213,7 @@ def collate_fn(self, batch: List[dict]):
         batch_audio = stack_tensors(audio_list, max_lens=[audio_max_len])
 
         batch_dict = {
+            "dataset_names": dataset_name_list,
             "audio_filepaths": audio_filepath_list,
             "audio": batch_audio,
             "audio_lens": batch_audio_len,