Addressed PR comments

Signed-off-by: Ante Jukić <[email protected]>
NVIDIA · Jun 30, 2024 · c79d0c6 · c79d0c6
1 parent 43aceb2
commit c79d0c6
Show file tree

Hide file tree

Showing 11 changed files with 541 additions and 555 deletions.
diff --git a/examples/audio/speech_enhancement.py → examples/audio/audio_to_audio_train.py b/examples/audio/speech_enhancement.py → examples/audio/audio_to_audio_train.py
@@ -16,7 +16,7 @@
 # Training the model
 
 Basic run (on CPU for 50 epochs):
-    python examples/audio/speech_enhancement.py \
+    python examples/audio/audio_to_audio_train.py \
         # (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
         model.train_ds.manifest_filepath="<path to manifest file>" \
         model.validation_ds.manifest_filepath="<path to manifest file>" \

diff --git a/examples/audio/conf/predictive.yaml b/examples/audio/conf/predictive.yaml
@@ -43,7 +43,7 @@ model:
     scale: ${model.encoder.scale}
 
   estimator:
-    _target_: nemo.collections.audio.parts.submodules.diffusion.SpectrogramNoiseConditionalScoreNetworkPlusPlus
+    _target_: nemo.collections.audio.parts.submodules.ncsnpp.SpectrogramNoiseConditionalScoreNetworkPlusPlus
     in_channels: 1 # single-channel noisy input
     out_channels: 1 # single-channel estimate
     num_res_blocks: 3 # increased number of res blocks

diff --git a/examples/audio/conf/score_based_generative.yaml b/examples/audio/conf/score_based_generative.yaml
@@ -45,7 +45,7 @@ model:
     scale: ${model.encoder.scale}
 
   estimator:
-    _target_: nemo.collections.audio.parts.submodules.diffusion.SpectrogramNoiseConditionalScoreNetworkPlusPlus
+    _target_: nemo.collections.audio.parts.submodules.ncsnpp.SpectrogramNoiseConditionalScoreNetworkPlusPlus
     in_channels: 2 # concatenation of single-channel perturbed and noisy
     out_channels: 1 # single-channel score estimate
     conditioned_on_time: true

diff --git a/nemo/collections/asr/parts/preprocessing/segment.py b/nemo/collections/asr/parts/preprocessing/segment.py
@@ -122,6 +122,28 @@ def select_channels(signal: npt.NDArray, channel_selector: Optional[ChannelSelec
     return signal
 
 
+def get_samples(audio_file: str, target_sr: int = 16000, dtype: str = 'float32'):
+    """
+    Read the samples from the given audio_file path. If not specified, the input audio file is automatically
+    resampled to 16kHz.
+
+    Args:
+        audio_file (str):
+            Path to the input audio file
+        target_sr (int):
+            Targeted sampling rate
+    Returns:
+        samples (numpy.ndarray):
+            Time-series sample data from the given audio file
+    """
+    with sf.SoundFile(audio_file, 'r') as f:
+        samples = f.read(dtype=dtype)
+        if f.samplerate != target_sr:
+            samples = librosa.core.resample(samples, orig_sr=f.samplerate, target_sr=target_sr)
+        samples = samples.transpose()
+    return samples
+
+
 class AudioSegment(object):
     """Audio segment abstraction.
     :param samples: Audio samples [num_samples x num_channels].

diff --git a/nemo/collections/asr/parts/utils/audio_utils.py b/nemo/collections/asr/parts/utils/audio_utils.py
diff --git a/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py b/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py
@@ -29,7 +29,7 @@
     CTCDecoding,
     CTCDecodingConfig,
 )
-from nemo.collections.asr.parts.utils.audio_utils import get_samples
+from nemo.collections.asr.parts.preprocessing.segment import get_samples
 from nemo.collections.asr.parts.utils.speaker_utils import audio_rttm_map, get_uniqname_from_filepath
 from nemo.collections.asr.parts.utils.streaming_utils import AudioFeatureIterator, FrameBatchASR
 from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec

diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py
@@ -24,7 +24,7 @@
 from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
 from nemo.collections.asr.parts.mixins.streaming import StreamingEncoder
 from nemo.collections.asr.parts.preprocessing.features import normalize_batch
-from nemo.collections.asr.parts.utils.audio_utils import get_samples
+from nemo.collections.asr.parts.preprocessing.segment import get_samples
 from nemo.core.classes import IterableDataset
 from nemo.core.neural_types import LengthsType, MelSpectrogramType, NeuralType