Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TTS] Filter out silent audio files during preprocessing #6716

Merged
merged 1 commit into from
May 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions nemo/collections/tts/parts/preprocessing/audio_trimming.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ def trim_audio(self, audio: np.array, sample_rate: int, audio_id: str = "") -> T
start_frame, end_frame = get_start_and_end_of_speech_frames(
is_speech=speech_frames, speech_frame_threshold=self.speech_frame_threshold, audio_id=audio_id,
)
if not start_frame and not end_frame:
return np.array([]), 0, 0

start_sample = librosa.core.frames_to_samples(start_frame, hop_length=self.trim_hop_length)
end_sample = librosa.core.frames_to_samples(end_frame, hop_length=self.trim_hop_length)
Expand Down Expand Up @@ -170,6 +172,9 @@ def __init__(
self.volume_norm = volume_norm

def _detect_speech(self, audio: np.array) -> np.array:
if audio.shape[0] < self.trim_win_length:
return np.array([])

# [num_frames, win_length]
audio_frames = librosa.util.frame(
audio, frame_length=self.trim_win_length, hop_length=self.trim_hop_length
Expand Down Expand Up @@ -214,6 +219,8 @@ def trim_audio(self, audio: np.array, sample_rate: int, audio_id: str = "") -> T
start_frame, end_frame = get_start_and_end_of_speech_frames(
is_speech=speech_frames, speech_frame_threshold=self.speech_frame_threshold, audio_id=audio_id,
)
if not start_frame and not end_frame:
return np.array([]), 0, 0

if start_frame == 0:
start_sample = 0
Expand Down Expand Up @@ -276,13 +283,10 @@ def get_start_and_end_of_speech_frames(
end_frame = i
break

if start_frame is None:
logging.warning(f"Could not find start of speech for '{audio_id}'")
start_frame = 0

if end_frame is None:
logging.warning(f"Could not find end of speech for '{audio_id}'")
end_frame = num_frames
if start_frame is None or end_frame is None:
# Algorithm is symmetric, so if the start is not found then the end should also not be found.
logging.warning(f"Could not find start or end of speech for '{audio_id}'")
return 0, 0

return start_frame, end_frame

Expand Down
6 changes: 4 additions & 2 deletions scripts/dataset_processing/tts/compute_feature_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ def get_args():
help="Path to output JSON file with dataset feature statistics.",
)
parser.add_argument(
"--overwrite", default=False, type=bool, help="Whether to overwrite the output stats file if it exists.",
"--overwrite",
action=argparse.BooleanOptionalAction,
help="Whether to overwrite the output stats file if it exists.",
)

args = parser.parse_args()
Expand Down Expand Up @@ -132,7 +134,7 @@ def main():

if not feature_dir.exists():
raise ValueError(
f"Feature directory {audio_dir} does not exist. "
f"Feature directory {feature_dir} does not exist. "
f"Please check that the path is correct and that you ran compute_features.py"
)

Expand Down
4 changes: 3 additions & 1 deletion scripts/dataset_processing/tts/create_speaker_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ def get_args():
"--speaker_map_path", required=True, type=Path, help="Path for output speaker index JSON",
)
parser.add_argument(
"--overwrite", default=False, type=bool, help="Whether to overwrite the output speaker file if it exists.",
"--overwrite",
action=argparse.BooleanOptionalAction,
help="Whether to overwrite the output speaker file if it exists.",
)
args = parser.parse_args()
return args
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,16 @@ def get_args():
parser.add_argument(
"--output_audio_dir", required=True, type=Path, help="Path to output directory for audio files.",
)
parser.add_argument(
"--overwrite_audio",
action=argparse.BooleanOptionalAction,
help="Whether to reprocess and overwrite existing audio files in output_audio_dir.",
)
parser.add_argument(
"--overwrite_manifest",
action=argparse.BooleanOptionalAction,
help="Whether to overwrite the output manifest file if it exists.",
)
parser.add_argument(
"--num_workers", default=1, type=int, help="Number of parallel threads to use. If -1 all CPUs are used."
)
Expand Down Expand Up @@ -110,6 +120,7 @@ def _process_entry(
entry: dict,
input_audio_dir: Path,
output_audio_dir: Path,
overwrite_audio: bool,
audio_trimmer: AudioTrimmer,
output_sample_rate: int,
volume_level: float,
Expand All @@ -120,30 +131,34 @@ def _process_entry(
output_path = output_audio_dir / audio_path_rel
output_path.parent.mkdir(exist_ok=True, parents=True)

audio_path = str(audio_path)
output_path = str(output_path)

audio, sample_rate = librosa.load(audio_path, sr=None)

if audio_trimmer is not None:
audio, start_i, end_i = audio_trimmer.trim_audio(audio=audio, sample_rate=sample_rate, audio_id=audio_path)

if output_sample_rate:
audio = librosa.resample(y=audio, orig_sr=sample_rate, target_sr=output_sample_rate)
sample_rate = output_sample_rate

if volume_level:
audio = normalize_volume(audio, volume_level=volume_level)

sf.write(file=output_path, data=audio, samplerate=sample_rate)

original_duration = librosa.get_duration(filename=audio_path)
output_duration = librosa.get_duration(filename=output_path)
if output_path.exists() and not overwrite_audio:
original_duration = librosa.get_duration(path=audio_path)
output_duration = librosa.get_duration(path=output_path)
else:
audio, sample_rate = librosa.load(audio_path, sr=None)
original_duration = librosa.get_duration(y=audio, sr=sample_rate)
if audio_trimmer is not None:
audio, start_i, end_i = audio_trimmer.trim_audio(
audio=audio, sample_rate=int(sample_rate), audio_id=str(audio_path)
)

if output_sample_rate:
audio = librosa.resample(y=audio, orig_sr=sample_rate, target_sr=output_sample_rate)
sample_rate = output_sample_rate

if volume_level:
audio = normalize_volume(audio, volume_level=volume_level)

if audio.size > 0:
sf.write(file=output_path, data=audio, samplerate=sample_rate)
output_duration = librosa.get_duration(y=audio, sr=sample_rate)
else:
output_duration = 0.0

entry["duration"] = round(output_duration, 2)

if os.path.isabs(audio_filepath):
entry["audio_filepath"] = output_path
entry["audio_filepath"] = str(output_path)

return entry, original_duration, output_duration

Expand All @@ -155,6 +170,8 @@ def main():
output_manifest_path = args.output_manifest
input_audio_dir = args.input_audio_dir
output_audio_dir = args.output_audio_dir
overwrite_audio = args.overwrite_audio
overwrite_manifest = args.overwrite_manifest
num_workers = args.num_workers
max_entries = args.max_entries
output_sample_rate = args.output_sample_rate
Expand All @@ -163,6 +180,12 @@ def main():
max_duration = args.max_duration
filter_file = args.filter_file

if output_manifest_path.exists():
if overwrite_manifest:
print(f"Will overwrite existing manifest path: {output_manifest_path}")
else:
raise ValueError(f"Manifest path already exists: {output_manifest_path}")

if args.trim_config_path:
audio_trimmer_config = OmegaConf.load(args.trim_config_path)
audio_trimmer = instantiate(audio_trimmer_config)
Expand All @@ -181,6 +204,7 @@ def main():
entry=entry,
input_audio_dir=input_audio_dir,
output_audio_dir=output_audio_dir,
overwrite_audio=overwrite_audio,
audio_trimmer=audio_trimmer,
output_sample_rate=output_sample_rate,
volume_level=volume_level,
Expand All @@ -195,7 +219,11 @@ def main():
for output_entry, original_duration, output_duration in job_outputs:
original_durations += original_duration

if (min_duration and output_duration < min_duration) or (max_duration and output_duration > max_duration):
if (
output_duration == 0.0
or (min_duration and output_duration < min_duration)
or (max_duration and output_duration > max_duration)
):
if output_duration != original_duration:
output_entry["original_duration"] = original_duration
filtered_entries.append(output_entry)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_get_start_and_end_of_speech_frames_not_frames_found(self):
)

assert start_frame == 0
assert end_frame == 4
assert end_frame == 0

@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
Expand Down
Loading