Update fvad doc #6920

Merged: 6 commits, Jun 26, 2023
8 changes: 4 additions & 4 deletions examples/asr/conf/vad/frame_vad_infer_postprocess.yaml
```diff
@@ -21,10 +21,10 @@ vad:
 postprocessing:
   onset: 0.3 # onset threshold for detecting the beginning and end of a speech
   offset: 0.3 # offset threshold for detecting the end of a speech.
-  pad_onset: 0.5 # adding durations before each speech segment
-  pad_offset: 0.5 # adding durations after each speech segment
-  min_duration_on: 0.0 # threshold for short speech deletion
-  min_duration_off: 0.6 # threshold for short non-speech segment deletion
+  pad_onset: 0.2 # adding durations before each speech segment
+  pad_offset: 0.2 # adding durations after each speech segment
+  min_duration_on: 0.2 # threshold for short speech deletion
+  min_duration_off: 0.2 # threshold for short non-speech segment deletion
   filter_speech_first: True

 prepared_manifest_vad_input: null # if not specify, it will automatically generated be "manifest_vad_input.json"
```
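For context on what these knobs control: `onset`/`offset` are the probability thresholds that open and close a speech segment, `pad_onset`/`pad_offset` extend each detected segment, and `min_duration_on`/`min_duration_off` drop overly short speech and non-speech stretches. The sketch below is a simplified illustration of that binarize-then-filter idea, not the actual implementation in `vad_utils.py`; the function name, the `probs` input, and the `frame_dur` argument are invented for the example.

```python
def postprocess_frame_probs(
    probs,
    frame_dur=0.02,
    onset=0.3,
    offset=0.3,
    pad_onset=0.2,
    pad_offset=0.2,
    min_duration_on=0.2,
    min_duration_off=0.2,
):
    """Turn per-frame speech probabilities into (start, end) segments in seconds (illustrative only)."""
    segments, start, in_speech = [], 0.0, False
    for i, p in enumerate(probs):
        t = i * frame_dur
        if not in_speech and p >= onset:      # segment opens when prob crosses `onset`
            start, in_speech = t, True
        elif in_speech and p < offset:        # segment closes when prob falls below `offset`
            segments.append([start, t])
            in_speech = False
    if in_speech:
        segments.append([start, len(probs) * frame_dur])

    # pad each segment, then merge segments separated by gaps shorter than min_duration_off
    merged = []
    for s, e in segments:
        s, e = max(0.0, s - pad_onset), e + pad_offset
        if merged and s - merged[-1][1] < min_duration_off:
            merged[-1][1] = max(merged[-1][1], e)
        else:
            merged.append([s, e])

    # finally drop speech segments shorter than min_duration_on
    return [(s, e) for s, e in merged if e - s >= min_duration_on]


# Example: two short bursts of speech separated by a brief gap end up merged into one segment.
probs = [0.1, 0.9, 0.9, 0.1, 0.1, 0.9, 0.9, 0.1]
print(postprocess_frame_probs(probs, frame_dur=0.1))
```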
17 changes: 17 additions & 0 deletions examples/asr/speech_classification/README.md
````diff
@@ -86,3 +86,20 @@
 {"audio_filepath": "/path/to/audio_file1.wav", "offset": 0, "duration": 10000}
 {"audio_filepath": "/path/to/audio_file2.wav", "offset": 0, "duration": 10000}
 ```
+
+
+## Visualization
+
+To visualize the VAD outputs, you can use the `nemo.collections.asr.parts.utils.vad_utils.plot_sample_from_rttm` function, which takes an audio file and an RTTM file as input, and plots the audio waveform and the VAD labels. Since the VAD inference script will output a json manifest `manifest_vad_out.json` by default, you can create a Jupyter Notebook with the following script and fill in the paths using the output manifest:
+```python
+from nemo.collections.asr.parts.utils.vad_utils import plot_sample_from_rttm
+
+plot_sample_from_rttm(
+    audio_file="/path/to/audio_file.wav",
+    rttm_file="/path/to/rttm_file.rttm",
+    offset=0.0,
+    duration=1000,
+    save_path="vad_pred.png"
+)
+```
+
````

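To plot every sample from the inference output rather than a single file, a small loop over the output manifest might look like the sketch below. This assumes, without guarantee, that each line of `manifest_vad_out.json` is a JSON dict carrying `audio_filepath` and `rttm_filepath` keys; adjust the key names to whatever your manifest actually contains.

```python
import json
from pathlib import Path

from nemo.collections.asr.parts.utils.vad_utils import plot_sample_from_rttm

# Assumption: each line of the output manifest holds "audio_filepath" and
# "rttm_filepath" fields; rename the keys here if your manifest differs.
with open("manifest_vad_out.json") as f:
    for line in f:
        entry = json.loads(line)
        plot_sample_from_rttm(
            audio_file=entry["audio_filepath"],
            rttm_file=entry["rttm_filepath"],
            offset=0.0,
            duration=1000,
            save_path=f"vad_pred_{Path(entry['audio_filepath']).stem}.png",
        )
```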
6 changes: 3 additions & 3 deletions nemo/collections/asr/parts/utils/vad_utils.py
```diff
@@ -1648,7 +1648,7 @@ def frame_vad_infer_load_manifest(cfg: DictConfig):
         manifest_orig.append(entry)

         # always prefer RTTM labels if exist
-        if "label" not in entry or "rttm_filepath" in entry or "rttm_file" in entry:
+        if "label" not in entry and ("rttm_filepath" in entry or "rttm_file" in entry):
             rttm_key = "rttm_filepath" if "rttm_filepath" in entry else "rttm_file"
             segments = load_speech_segments_from_rttm(entry[rttm_key])
             label_str = get_frame_labels(
@@ -1661,8 +1661,8 @@ def frame_vad_infer_load_manifest(cfg: DictConfig):
             key_labels_map[uniq_audio_name] = [float(x) for x in label_str.split()]
         elif entry.get("label", None) is not None:
             key_labels_map[uniq_audio_name] = [float(x) for x in entry["label"].split()]
-        else:
-            raise ValueError("Must have either `label` or `rttm_filepath` in manifest")
+        elif cfg.evaluate:
+            raise ValueError("Must have either `label` or `rttm_filepath` in manifest when evaluate=True")

     return manifest_orig, key_labels_map, key_rttm_map
```

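To spell out the behavior change: with the new condition, an entry's RTTM file is used when the entry carries no inline `label`, an inline `label` string is used when present, and a manifest entry with neither only triggers the error when `cfg.evaluate` is set. A toy sketch of that decision order, separate from the NeMo code, follows; `resolve_ground_truth` is a made-up helper for illustration.

```python
# Toy illustration of the corrected precedence (not the actual NeMo function):
# RTTM is used when the entry has no inline "label", an inline "label" is used
# otherwise, and missing ground truth only raises when evaluation is requested.
def resolve_ground_truth(entry: dict, evaluate: bool):
    has_rttm = "rttm_filepath" in entry or "rttm_file" in entry
    if "label" not in entry and has_rttm:
        rttm_key = "rttm_filepath" if "rttm_filepath" in entry else "rttm_file"
        return ("rttm", entry[rttm_key])
    if entry.get("label") is not None:
        return ("label", entry["label"])
    if evaluate:
        raise ValueError("Must have either `label` or `rttm_filepath` in manifest when evaluate=True")
    return ("none", None)  # plain inference: no ground truth is fine


# Example: an inference-only entry without labels no longer raises.
print(resolve_ground_truth({"audio_filepath": "a.wav", "duration": 10.0}, evaluate=False))
```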