Skip to content

Commit

Permalink
[TTS] Add dataset to path of logged artifacts (NVIDIA#7462)
Browse files: browse the repository at this point in the history
* [TTS] Add dataset to path of logged artifacts

Signed-off-by: Ryan <[email protected]>

* [TTS] Revert axis name back to Audio Frames

Signed-off-by: Ryan <[email protected]>

---------

Signed-off-by: Ryan <[email protected]>
Signed-off-by: Sasha Meister <[email protected]>
  • Loading branch information
rlangman authored and ssh-meister committed Oct 5, 2023
1 parent 1bd4bd0 commit d22b4d1
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 61 deletions.
12 changes: 11 additions & 1 deletion nemo/collections/tts/data/text_to_speech_dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -46,6 +46,7 @@ class DatasetMeta:

@dataclass
class DatasetSample:
dataset_name: str
manifest_entry: Dict[str, Any]
audio_dir: Path
feature_dir: Path
Expand Down Expand Up @@ -180,6 +181,7 @@ def _preprocess_manifest(
speaker_index = 0

sample = DatasetSample(
dataset_name=dataset_name,
manifest_entry=entry,
audio_dir=Path(dataset.audio_dir),
feature_dir=Path(dataset.feature_dir),
Expand All @@ -204,7 +206,12 @@ def __getitem__(self, index):
audio, _ = librosa.load(audio_filepath_abs, sr=self.sample_rate)
tokens = self.text_tokenizer(data.text)

example = {"audio_filepath": audio_filepath_rel, "audio": audio, "tokens": tokens}
example = {
"dataset_name": data.dataset_name,
"audio_filepath": audio_filepath_rel,
"audio": audio,
"tokens": tokens,
}

if data.speaker is not None:
example["speaker"] = data.speaker
Expand All @@ -229,6 +236,7 @@ def __getitem__(self, index):
return example

def collate_fn(self, batch: List[dict]):
dataset_name_list = []
audio_filepath_list = []
audio_list = []
audio_len_list = []
Expand All @@ -238,6 +246,7 @@ def collate_fn(self, batch: List[dict]):
prior_list = []

for example in batch:
dataset_name_list.append(example["dataset_name"])
audio_filepath_list.append(example["audio_filepath"])

audio_tensor = torch.tensor(example["audio"], dtype=torch.float32)
Expand All @@ -264,6 +273,7 @@ def collate_fn(self, batch: List[dict]):
batch_tokens = stack_tensors(token_list, max_lens=[token_max_len], pad_value=self.text_tokenizer.pad)

batch_dict = {
"dataset_names": dataset_name_list,
"audio_filepaths": audio_filepath_list,
"audio": batch_audio,
"audio_lens": batch_audio_len,
Expand Down
13 changes: 11 additions & 2 deletions nemo/collections/tts/data/vocoder_dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -43,6 +43,7 @@ class DatasetMeta:

@dataclass
class DatasetSample:
dataset_name: str
manifest_entry: dict
audio_dir: Path

Expand Down Expand Up @@ -165,7 +166,7 @@ def _preprocess_manifest(
samples = []
sample_weights = []
for entry in filtered_entries:
sample = DatasetSample(manifest_entry=entry, audio_dir=Path(dataset.audio_dir),)
sample = DatasetSample(dataset_name=dataset_name, manifest_entry=entry, audio_dir=Path(dataset.audio_dir))
samples.append(sample)
sample_weights.append(dataset.sample_weight)

Expand All @@ -182,19 +183,26 @@ def __getitem__(self, index):

audio, audio_len = self._sample_audio(audio_filepath_abs)

example = {"audio_filepath": audio_filepath_rel, "audio": audio, "audio_len": audio_len}
example = {
"dataset_name": data.dataset_name,
"audio_filepath": audio_filepath_rel,
"audio": audio,
"audio_len": audio_len,
}

for processor in self.feature_processors:
processor.process(example)

return example

def collate_fn(self, batch: List[dict]):
dataset_name_list = []
audio_filepath_list = []
audio_list = []
audio_len_list = []

for example in batch:
dataset_name_list.append(example["dataset_name"])
audio_filepath_list.append(example["audio_filepath"])
audio_list.append(example["audio"])
audio_len_list.append(example["audio_len"])
Expand All @@ -205,6 +213,7 @@ def collate_fn(self, batch: List[dict]):
batch_audio = stack_tensors(audio_list, max_lens=[audio_max_len])

batch_dict = {
"dataset_names": dataset_name_list,
"audio_filepaths": audio_filepath_list,
"audio": batch_audio,
"audio_lens": batch_audio_len,
Expand Down
Loading

0 comments on commit d22b4d1

Please sign in to comment.