Fix bugs in MixedCut logic (#1073)

* Fix .truncate() for MixedCut for the case when no tracks overlap with the requested region
* Provide base audio offset to AudioMixer where necessary
* Test for truncating between tracks in MixedCut
* Add test on mixed cut loading with the first track offset

---------

Co-authored-by: Piotr Żelasko <[email protected]>
lhotse-speech · May 26, 2023 · d430776 · d430776
1 parent 4ecbb02
commit d430776
Show file tree

Hide file tree

Showing 5 changed files with 133 additions and 2 deletions.
diff --git a/lhotse/audio.py b/lhotse/audio.py
@@ -1308,6 +1308,7 @@ def __init__(
         base_audio: np.ndarray,
         sampling_rate: int,
         reference_energy: Optional[float] = None,
+        base_offset: Seconds = 0.0,
     ):
         """
         AudioMixer's constructor.
@@ -1317,9 +1318,10 @@ def __init__(
         :param sampling_rate: Sampling rate of the audio.
         :param reference_energy: Optionally pass a reference energy value to compute SNRs against.
             This might be required when ``base_audio`` corresponds to zero-padding.
+        :param base_offset: Optionally pass a time offset for the base signal.
         """
         self.tracks = [base_audio]
-        self.offsets = [0]
+        self.offsets = [compute_num_samples(base_offset, sampling_rate)]
         self.sampling_rate = sampling_rate
         self.num_channels = base_audio.shape[0]
         self.dtype = self.tracks[0].dtype

diff --git a/lhotse/cut/mixed.py b/lhotse/cut/mixed.py
@@ -490,6 +490,17 @@ def truncate(
                     snr=track.snr,
                 )
             )
+
+        # Edge case: no tracks left after truncation. This can happen if we truncated an offset region.
+        # In this case, return a PaddingCut of the requested duration
+        if len(new_tracks) == 0:
+            return PaddingCut(
+                id=self.id if preserve_id else str(uuid4()),
+                duration=duration,
+                sampling_rate=self.sampling_rate,
+                feat_value=0.0,
+            )
+
         if len(new_tracks) == 1:
             # The truncation resulted in just a single cut - simply return it.
             return new_tracks[0].cut
@@ -1047,6 +1058,7 @@ def load_audio(
             self.tracks[0].cut.load_audio(),
             sampling_rate=self.tracks[0].cut.sampling_rate,
             reference_energy=reference_energy,
+            base_offset=self.tracks[0].offset,
         )
 
         for pos, track in enumerate(self.tracks[1:], start=1):

diff --git a/test/cut/test_cut_mixing.py b/test/cut/test_cut_mixing.py
@@ -180,6 +180,16 @@ def mixed_audio_cut() -> MixedCut:
     return mixed_cut
 
 
+@pytest.fixture
+def offseted_mixed_audio_cut() -> MixedCut:
+    cut_set = CutSet.from_json(
+        "test/fixtures/mix_cut_test/offseted_audio_cut_manifest.json"
+    )
+    mixed_cut = cut_set["mixed-cut-id"]
+    assert isclose(mixed_cut.duration, 16.66)
+    return mixed_cut
+
+
 def test_mixed_cut_load_audio_mixed(mixed_audio_cut):
     audio = mixed_audio_cut.load_audio()
     assert audio.shape == (1, 230400)
@@ -193,6 +203,11 @@ def test_mixed_cut_load_audio_unmixed(mixed_audio_cut):
     assert audio[1].shape == (1, 230400)
 
 
+def test_mixed_cut_load_offseted_mixed(offseted_mixed_audio_cut):
+    audio = offseted_mixed_audio_cut.load_audio()
+    assert audio.shape == (1, 266560)
+
+
 @pytest.mark.parametrize(
     "mixed, mono_downmix",
     [

diff --git a/test/cut/test_cut_truncate.py b/test/cut/test_cut_truncate.py
@@ -4,7 +4,7 @@
 import pytest
 
 from lhotse import RecordingSet
-from lhotse.cut import CutSet, MixedCut, MixTrack, MonoCut
+from lhotse.cut import CutSet, MixedCut, MixTrack, MonoCut, PaddingCut
 from lhotse.features import Features
 from lhotse.supervision import SupervisionSegment, SupervisionSet
 from lhotse.testing.dummies import DummyManifest, dummy_cut, dummy_recording
@@ -139,6 +139,17 @@ def simple_mixed_cut():
     )
 
 
+@pytest.fixture
+def gapped_mixed_cut():
+    return MixedCut(
+        id="gapped-mixed-cut",
+        tracks=[
+            MixTrack(cut=dummy_cut(0, duration=10.0)),
+            MixTrack(cut=dummy_cut(1, duration=10.0), offset=15.0),
+        ],
+    )
+
+
 def test_truncate_mixed_cut_without_args(simple_mixed_cut):
     truncated_cut = simple_mixed_cut.truncate()
     assert truncated_cut.duration == 15.0
@@ -214,6 +225,14 @@ def test_truncate_mixed_cut_with_small_offset_and_duration(simple_mixed_cut):
     assert truncated_cut.duration == 13.0
 
 
+def test_truncate_mixed_cut_inside_gap(gapped_mixed_cut):
+    truncated_cut = gapped_mixed_cut.truncate(offset=11.0, duration=3.0)
+    assert isinstance(truncated_cut, PaddingCut)
+    assert truncated_cut.start == 0.0
+    assert truncated_cut.duration == 3.0
+    assert truncated_cut.end == 3.0
+
+
 def test_truncate_cut_set_offset_start(cut_set):
     truncated_cut_set = cut_set.truncate(max_duration=5, offset_type="start")
     cut1, cut2 = truncated_cut_set

diff --git a/test/fixtures/mix_cut_test/offseted_audio_cut_manifest.json b/test/fixtures/mix_cut_test/offseted_audio_cut_manifest.json
@@ -0,0 +1,83 @@
+[
+  {
+    "id": "mixed-cut-id",
+    "tracks": [
+      {
+        "cut": {
+          "channel": 0,
+          "id": "ab34ea6b-9d68-4113-8386-b4c585e596f5",
+          "duration": 11.66,
+          "recording": {
+            "duration": 11.66,
+            "id": "2412-153948-0000",
+            "num_samples": 186560,
+            "sampling_rate": 16000,
+            "sources": [
+              {
+                "channels": [
+                  0
+                ],
+                "source": "test/fixtures/mix_cut_test/audio/storage/2412-153948-0000.flac",
+                "type": "file"
+              }
+            ]
+          },
+          "start": 0,
+          "supervisions": [
+            {
+              "channel": 0,
+              "duration": 11.66,
+              "id": "2412-153948-0000",
+              "language": "English",
+              "recording_id": "2412-153948-0000",
+              "speaker": "2412",
+              "start": 0.0,
+              "text": "IF THE READER WILL EXCUSE ME I WILL SAY NOTHING OF MY ANTECEDENTS NOR OF THE CIRCUMSTANCES WHICH LED ME TO LEAVE MY NATIVE COUNTRY THE NARRATIVE WOULD BE TEDIOUS TO HIM AND PAINFUL TO MYSELF"
+            }
+          ]
+        },
+        "type": "MonoCut",
+        "offset": 5.0
+      },
+      {
+        "cut": {
+          "channel": 0,
+          "id": "07d009a9-e7a1-4611-aa21-32ff9b43dff0",
+          "duration": 10.51,
+          "recording": {
+            "duration": 10.51,
+            "id": "2412-153948-0001",
+            "num_samples": 168160,
+            "sampling_rate": 16000,
+            "sources": [
+              {
+                "channels": [
+                  0
+                ],
+                "source": "test/fixtures/mix_cut_test/audio/storage/2412-153948-0001.flac",
+                "type": "file"
+              }
+            ]
+          },
+          "start": 0,
+          "supervisions": [
+            {
+              "channel": 0,
+              "duration": 4.65,
+              "id": "174-168635-0001",
+              "language": "English",
+              "recording_id": "174-168635-0001",
+              "speaker": "174",
+              "start": 0.0,
+              "text": "THE HEART OF THAT EX CONVICT WAS FULL OF VIRGINITY"
+            }
+          ]
+        },
+        "type": "MonoCut",
+        "offset": 5.02,
+        "snr": 20.0
+      }
+    ],
+    "type": "MixedCut"
+  }
+]