Remove negative duration segments from whisper (#928)

This PR addresses #891.

* Remove segments with non-positive duration from the Whisper output.
* Make the segment post-processing that forces non-overlapping segments optional (disabled by default).
* Allow overlapping segments in the forced alignment workflow.
lhotse-speech · Dec 16, 2022 · 891bad1 · 891bad1
2 parents e83afd3 + f411440
commit 891bad1
Show file tree

Hide file tree

Showing 4 changed files with 50 additions and 16 deletions.
diff --git a/lhotse/bin/modes/workflows.py b/lhotse/bin/modes/workflows.py
@@ -57,6 +57,11 @@ def workflows():
     "-d", "--device", default="cpu", help="Device on which to run the inference."
 )
 @click.option("-j", "--jobs", default=1, help="Number of jobs for audio scanning.")
+@click.option(
+    "--force-nonoverlapping/--keep-overlapping",
+    default=False,
+    help="If True, the Whisper segment time-stamps will be processed to make sure they are non-overlapping.",
+)
 def annotate_with_whisper(
     out_cuts: str,
     recordings_manifest: Optional[str],
@@ -67,6 +72,7 @@ def annotate_with_whisper(
     language: Optional[str],
     device: str,
     jobs: int,
+    force_nonoverlapping: bool,
 ):
     """
     Use OpenAI Whisper model to annotate either RECORDINGS_MANIFEST, RECORDINGS_DIR, or CUTS_MANIFEST.
@@ -101,6 +107,7 @@ def annotate_with_whisper(
                 language=language,
                 model_name=model_name,
                 device=device,
+                force_nonoverlapping=force_nonoverlapping,
             ),
             total=len(manifest),
             desc="Annotating with Whisper",

diff --git a/lhotse/qa.py b/lhotse/qa.py
@@ -190,7 +190,7 @@ def trim_supervisions_to_recordings(
             continue
         if s.end > end:
             trimmed += 1
-            s = s.trim(recordings[s.recording_id].duration)
+            s = s.trim(end=end)
         sups.append(s)
     if verbose and removed:
         logging.warning(

diff --git a/lhotse/workflows/forced_alignment.py b/lhotse/workflows/forced_alignment.py
@@ -57,10 +57,6 @@ def align_with_torchaudio(
     discard_symbols = _make_discard_symbols_regex(labels)
 
     for cut in cuts:
-        assert not cut.has_overlapping_supervisions, (
-            f"We don't support forced alignment of cuts with overlapping supervisions "
-            f"(cut ID: '{cut.id}')"
-        )
 
         for idx, subcut in enumerate(cut.trim_to_supervisions(keep_overlapping=False)):
             sup = subcut.supervisions[0]

diff --git a/lhotse/workflows/whisper.py b/lhotse/workflows/whisper.py
@@ -20,6 +20,7 @@ def annotate_with_whisper(
     language: Optional[str] = None,
     model_name: str = "base",
     device: str = "cpu",
+    force_nonoverlapping: bool = False,
 ) -> Generator[MonoCut, None, None]:
     """
     Use OpenAI Whisper model to annotate either RECORDINGS_MANIFEST, RECORDINGS_DIR, or CUTS_MANIFEST.
@@ -35,6 +36,8 @@ def annotate_with_whisper(
     :param language: specify the language if known upfront, otherwise it will be auto-detected.
     :param model_name: one of available Whisper variants (base, medium, large, etc.).
     :param device: Where to run the inference (cpu, cuda, etc.).
+    :param force_nonoverlapping: if True, the Whisper segment time-stamps will be processed to make
+        sure they are non-overlapping.
     :return: a generator of cuts (use ``CutSet.open_writer()`` to write them).
     """
     assert is_module_available("whisper"), (
@@ -44,15 +47,23 @@ def annotate_with_whisper(
     )
 
     if isinstance(manifest, RecordingSet):
-        yield from _annotate_recordings(manifest, language, model_name, device)
+        yield from _annotate_recordings(
+            manifest, language, model_name, device, force_nonoverlapping
+        )
     elif isinstance(manifest, CutSet):
-        yield from _annotate_cuts(manifest, language, model_name, device)
+        yield from _annotate_cuts(
+            manifest, language, model_name, device, force_nonoverlapping
+        )
     else:
         raise ValueError("The ``manifest`` must be either a RecordingSet or a CutSet.")
 
 
 def _annotate_recordings(
-    recordings: RecordingSet, language: str, model_name: str, device: str
+    recordings: RecordingSet,
+    language: str,
+    model_name: str,
+    device: str,
+    force_nonoverlapping: bool,
 ):
     """
     Helper function that annotates a RecordingSet with Whisper.
@@ -70,6 +81,7 @@ def _annotate_recordings(
             continue
         audio = torch.from_numpy(recording.resample(16000).load_audio()).squeeze(0)
         result = whisper.transcribe(model=model, audio=audio, language=language)
+        # Create supervisions from segments while filtering out those with negative duration.
         supervisions = [
             SupervisionSegment(
                 id=f"{recording.id}-{segment['id']:06d}",
@@ -82,10 +94,15 @@ def _annotate_recordings(
                 language=result["language"],
             )
             for segment in result["segments"]
+            if segment["end"] - segment["start"] > 0
         ]
         cut = recording.to_cut()
         if supervisions:
-            supervisions = _postprocess_timestamps(supervisions)
+            supervisions = (
+                _postprocess_timestamps(supervisions)
+                if force_nonoverlapping
+                else supervisions
+            )
             cut.supervisions = list(
                 trim_supervisions_to_recordings(
                     recordings=recording, supervisions=supervisions, verbose=False
@@ -94,7 +111,13 @@ def _annotate_recordings(
         yield cut
 
 
-def _annotate_cuts(cuts: CutSet, language: str, model_name: str, device: str):
+def _annotate_cuts(
+    cuts: CutSet,
+    language: str,
+    model_name: str,
+    device: str,
+    force_nonoverlapping: bool,
+):
     """
     Helper function that annotates a CutSet with Whisper.
     """
@@ -111,23 +134,29 @@ def _annotate_cuts(cuts: CutSet, language: str, model_name: str, device: str):
             continue
         audio = torch.from_numpy(cut.resample(16000).load_audio()).squeeze(0)
         result = whisper.transcribe(model=model, audio=audio, language=language)
+        # Create supervisions from segments while filtering out those with negative duration.
         supervisions = [
             SupervisionSegment(
                 id=f"{cut.id}-{segment['id']:06d}",
                 recording_id=cut.recording_id,
                 start=round(segment["start"], ndigits=8),
-                duration=max(
-                    cut.duration,
-                    add_durations(
-                        segment["end"], -segment["start"], sampling_rate=16000
-                    ),
+                duration=add_durations(
+                    min(segment["end"], cut.duration),
+                    -segment["start"],
+                    sampling_rate=16000,
                 ),
                 text=segment["text"].strip(),
                 language=result["language"],
             )
             for segment in result["segments"]
+            if segment["end"] - segment["start"] > 0
         ]
-        new_cut = fastcopy(cut, supervisions=_postprocess_timestamps(supervisions))
+        new_cut = fastcopy(
+            cut,
+            supervisions=_postprocess_timestamps(supervisions)
+            if force_nonoverlapping
+            else supervisions,
+        )
         yield new_cut
 
 
@@ -139,6 +168,8 @@ def _postprocess_timestamps(supervisions: List[SupervisionSegment]):
     """
     from cytoolz import sliding_window
 
+    supervisions = sorted(supervisions, key=lambda s: s.start)
+
     if len(supervisions) < 2:
         return supervisions
     out = []