Skip to content

Commit

Permalink
Remap speakers to continuous range of speaker_id for dataset AISHELL3 (
Browse files Browse the repository at this point in the history
…NVIDIA#7536)

* Remap speakers to continuous range of speaker_id for dataset AISHELL3
* Add new key/value pair to record raw speaker for AISHELL3 dataset

Signed-off-by: Robin Dong <[email protected]>

---------

Signed-off-by: Robin Dong <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Sasha Meister <[email protected]>
  • Loading branch information
2 people authored and ssh-meister committed Oct 5, 2023
1 parent cf6f95f commit 9b19b68
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ model:
learn_alignment: true
bin_loss_warmup_epochs: 100

n_speakers: 1958
n_speakers: 175
max_token_duration: 75
symbols_embedding_dim: 384
pitch_embedding_kernel_size: 3
Expand Down
38 changes: 25 additions & 13 deletions scripts/dataset_processing/tts/aishell3/get_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,14 +86,17 @@ def __process_transcript(file_path: str):
text_normalizer_call_kwargs = {"punct_pre_process": True, "punct_post_process": True}
normalizer_call = lambda x: text_normalizer.normalize(x, **text_normalizer_call_kwargs)
entries = []
i = 0
SPEAKER_LEN = 7

candidates = []
speakers = set()
with open(file_path / "train" / "content.txt", encoding="utf-8") as fin:
for line in fin:
content = line.split()
wav_name, text = content[0], "".join(content[1::2]) + "。"
wav_name = wav_name.replace(u'\ufeff', '')
speaker = wav_name[:SPEAKER_LEN]
speakers.add(speaker)
wav_file = file_path / "train" / "wav" / speaker / wav_name
assert os.path.exists(wav_file), f"{wav_file} not found!"
duration = subprocess.check_output(f"soxi -D {wav_file}", shell=True)
Expand All @@ -102,18 +105,27 @@ def __process_transcript(file_path: str):
processed_file = file_path / "processed" / wav_name
# convert wav to mono, 22050 Hz, 16-bit (same format as the SFSpeech dataset)
subprocess.run(f"sox {wav_file} -r 22050 -c 1 -b 16 {processed_file}", shell=True)
simplified_text = cc.convert(text)
normalized_text = normalizer_call(simplified_text)
entry = {
'audio_filepath': os.path.abspath(processed_file),
'duration': float(duration),
'text': text,
'normalized_text': normalized_text,
'speaker': int(speaker[3:]),
}

i += 1
entries.append(entry)
candidates.append((processed_file, duration, text, speaker))

# remap each raw speaker label to a contiguous speaker_id (starting from 1)
remapping = {}
for index, speaker in enumerate(sorted(speakers)):
remapping[speaker] = index + 1

for processed_file, duration, text, speaker in candidates:
simplified_text = cc.convert(text)
normalized_text = normalizer_call(simplified_text)
entry = {
'audio_filepath': os.path.abspath(processed_file),
'duration': float(duration),
'text': text,
'normalized_text': normalized_text,
'speaker_raw': speaker,
'speaker': remapping[speaker],
}

entries.append(entry)

return entries


Expand Down

0 comments on commit 9b19b68

Please sign in to comment.