[TTS] Read audio as int32 to avoid flac read errors (NVIDIA#7477)

* [TTS] Read audio as int32 to avoid flac read errors

  Signed-off-by: Ryan <[email protected]>

* [TTS] Add comment about read failures

  Signed-off-by: Ryan <[email protected]>

---------

Signed-off-by: Ryan <[email protected]>
Signed-off-by: Sasha Meister <[email protected]>
ssh-meister · Oct 10, 2023 · 92f0eec · 92f0eec
1 parent 9c83365
commit 92f0eec
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 6 deletions.
diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py
@@ -375,7 +375,15 @@ def from_file_list(
 
     @classmethod
     def segment_from_file(
-        cls, audio_file, target_sr=None, n_segments=0, trim=False, orig_sr=None, channel_selector=None, offset=None
+        cls,
+        audio_file,
+        target_sr=None,
+        n_segments=0,
+        trim=False,
+        orig_sr=None,
+        channel_selector=None,
+        offset=None,
+        dtype='float32',
     ):
         """Grabs n_segments number of samples from audio_file.
         If offset is not provided, n_segments are selected randomly.
@@ -390,6 +398,7 @@ def segment_from_file(
         :param orig_sr: the original sample rate
         :param channel_selector: select a subset of channels. If set to `None`, the original signal will be used.
         :param offset: fixed offset in seconds
+        :param dtype: data type to load audio as.
         :return: numpy array of samples
         """
         is_segmented = False
@@ -412,15 +421,15 @@ def segment_from_file(
                                 f'Provided audio start ({audio_start}) is larger than the maximum possible ({max_audio_start})'
                             )
                     f.seek(audio_start)
-                    samples = f.read(n_segments_at_original_sr, dtype='float32')
+                    samples = f.read(n_segments_at_original_sr, dtype=dtype)
                     is_segmented = True
                 elif n_segments_at_original_sr > len(f):
                     logging.warning(
                         f"Number of segments ({n_segments_at_original_sr}) is greater than the length ({len(f)}) of the audio file {audio_file}. This may lead to shape mismatch errors."
                     )
-                    samples = f.read(dtype='float32')
+                    samples = f.read(dtype=dtype)
                 else:
-                    samples = f.read(dtype='float32')
+                    samples = f.read(dtype=dtype)
         except RuntimeError as e:
             logging.error(f"Loading {audio_file} via SoundFile raised RuntimeError: `{e}`.")
             raise e

diff --git a/nemo/collections/tts/data/vocoder_dataset.py b/nemo/collections/tts/data/vocoder_dataset.py
@@ -122,11 +122,13 @@ def get_sampler(self, batch_size: int) -> Optional[torch.utils.data.Sampler]:
         return sampler
 
     def _segment_audio(self, audio_filepath: Path) -> AudioSegment:
-        # Retry file read multiple times as file seeking can produce random IO errors.
+        # File seeking sometimes fails when reading flac files with libsndfile < 1.0.30.
+        # Read audio as int32 to minimize issues, and retry read on a different segment in case of failure.
+        # https://github.com/bastibe/python-soundfile/issues/274
         for _ in range(self.num_audio_retries):
             try:
                 audio_segment = AudioSegment.segment_from_file(
-                    audio_filepath, target_sr=self.sample_rate, n_segments=self.n_samples,
+                    audio_filepath, target_sr=self.sample_rate, n_segments=self.n_samples, dtype="int32"
                 )
                 return audio_segment
             except Exception: