Skip to content

Commit

Permalink
Fix bugs in MixedCut logic (#1073)
Browse files Browse the repository at this point in the history
* Fix .truncate() for MixedCut for the case when no tracks overlap with the requested region

* Provide base audio offset to AudioMixer where necessary

* Test for truncating between tracks in MixedCut

* Add a test for loading a mixed cut whose first track has a non-zero offset

---------

Co-authored-by: Piotr Żelasko <[email protected]>
  • Loading branch information
flyingleafe and pzelasko authored May 26, 2023
1 parent 4ecbb02 commit d430776
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 2 deletions.
4 changes: 3 additions & 1 deletion lhotse/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -1308,6 +1308,7 @@ def __init__(
base_audio: np.ndarray,
sampling_rate: int,
reference_energy: Optional[float] = None,
base_offset: Seconds = 0.0,
):
"""
AudioMixer's constructor.
Expand All @@ -1317,9 +1318,10 @@ def __init__(
:param sampling_rate: Sampling rate of the audio.
:param reference_energy: Optionally pass a reference energy value to compute SNRs against.
This might be required when ``base_audio`` corresponds to zero-padding.
:param base_offset: Optionally pass a time offset for the base signal.
"""
self.tracks = [base_audio]
self.offsets = [0]
self.offsets = [compute_num_samples(base_offset, sampling_rate)]
self.sampling_rate = sampling_rate
self.num_channels = base_audio.shape[0]
self.dtype = self.tracks[0].dtype
Expand Down
12 changes: 12 additions & 0 deletions lhotse/cut/mixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,17 @@ def truncate(
snr=track.snr,
)
)

# Edge case: no tracks left after truncation. This can happen if we truncated an offset region.
# In this case, return a PaddingCut of the requested duration
if len(new_tracks) == 0:
return PaddingCut(
id=self.id if preserve_id else str(uuid4()),
duration=duration,
sampling_rate=self.sampling_rate,
feat_value=0.0,
)

if len(new_tracks) == 1:
# The truncation resulted in just a single cut - simply return it.
return new_tracks[0].cut
Expand Down Expand Up @@ -1047,6 +1058,7 @@ def load_audio(
self.tracks[0].cut.load_audio(),
sampling_rate=self.tracks[0].cut.sampling_rate,
reference_energy=reference_energy,
base_offset=self.tracks[0].offset,
)

for pos, track in enumerate(self.tracks[1:], start=1):
Expand Down
15 changes: 15 additions & 0 deletions test/cut/test_cut_mixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,16 @@ def mixed_audio_cut() -> MixedCut:
return mixed_cut


@pytest.fixture
def offseted_mixed_audio_cut() -> MixedCut:
    """Provide the ``mixed-cut-id`` cut from the offseted-audio manifest.

    The manifest is read from the test fixtures directory, and the cut's
    duration is sanity-checked before it is handed to the test.
    """
    manifest_path = "test/fixtures/mix_cut_test/offseted_audio_cut_manifest.json"
    cut = CutSet.from_json(manifest_path)["mixed-cut-id"]
    # Guard against the fixture file drifting: the tests relying on this
    # fixture hard-code values derived from this duration.
    assert isclose(cut.duration, 16.66)
    return cut


def test_mixed_cut_load_audio_mixed(mixed_audio_cut):
audio = mixed_audio_cut.load_audio()
assert audio.shape == (1, 230400)
Expand All @@ -193,6 +203,11 @@ def test_mixed_cut_load_audio_unmixed(mixed_audio_cut):
assert audio[1].shape == (1, 230400)


def test_mixed_cut_load_offseted_mixed(offseted_mixed_audio_cut):
    """Loading the offseted mixed cut yields audio of the expected shape."""
    samples = offseted_mixed_audio_cut.load_audio()
    # Expected shape is (1, 266560); the fixture asserts a duration of 16.66.
    assert samples.shape == (1, 266560)


@pytest.mark.parametrize(
"mixed, mono_downmix",
[
Expand Down
21 changes: 20 additions & 1 deletion test/cut/test_cut_truncate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

from lhotse import RecordingSet
from lhotse.cut import CutSet, MixedCut, MixTrack, MonoCut
from lhotse.cut import CutSet, MixedCut, MixTrack, MonoCut, PaddingCut
from lhotse.features import Features
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.testing.dummies import DummyManifest, dummy_cut, dummy_recording
Expand Down Expand Up @@ -139,6 +139,17 @@ def simple_mixed_cut():
)


@pytest.fixture
def gapped_mixed_cut():
    """Build a two-track ``MixedCut`` whose second track is placed at offset 15.0.

    Both tracks wrap dummy cuts of duration 10.0; the first track is created
    without an explicit offset.
    """
    leading_track = MixTrack(cut=dummy_cut(0, duration=10.0))
    trailing_track = MixTrack(cut=dummy_cut(1, duration=10.0), offset=15.0)
    return MixedCut(
        id="gapped-mixed-cut",
        tracks=[leading_track, trailing_track],
    )


def test_truncate_mixed_cut_without_args(simple_mixed_cut):
truncated_cut = simple_mixed_cut.truncate()
assert truncated_cut.duration == 15.0
Expand Down Expand Up @@ -214,6 +225,14 @@ def test_truncate_mixed_cut_with_small_offset_and_duration(simple_mixed_cut):
assert truncated_cut.duration == 13.0


def test_truncate_mixed_cut_inside_gap(gapped_mixed_cut):
    """Truncating to a window at offset 11.0 with duration 3.0 gives a ``PaddingCut``.

    The result is expected to start at 0.0 and span exactly the requested
    duration.
    """
    result = gapped_mixed_cut.truncate(offset=11.0, duration=3.0)

    # The truncated result must be padding rather than a mixed or mono cut.
    assert isinstance(result, PaddingCut)

    # Its timing covers exactly the requested window length, starting at zero.
    assert result.start == 0.0
    assert result.duration == 3.0
    assert result.end == 3.0


def test_truncate_cut_set_offset_start(cut_set):
truncated_cut_set = cut_set.truncate(max_duration=5, offset_type="start")
cut1, cut2 = truncated_cut_set
Expand Down
83 changes: 83 additions & 0 deletions test/fixtures/mix_cut_test/offseted_audio_cut_manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
[
{
"id": "mixed-cut-id",
"tracks": [
{
"cut": {
"channel": 0,
"id": "ab34ea6b-9d68-4113-8386-b4c585e596f5",
"duration": 11.66,
"recording": {
"duration": 11.66,
"id": "2412-153948-0000",
"num_samples": 186560,
"sampling_rate": 16000,
"sources": [
{
"channels": [
0
],
"source": "test/fixtures/mix_cut_test/audio/storage/2412-153948-0000.flac",
"type": "file"
}
]
},
"start": 0,
"supervisions": [
{
"channel": 0,
"duration": 11.66,
"id": "2412-153948-0000",
"language": "English",
"recording_id": "2412-153948-0000",
"speaker": "2412",
"start": 0.0,
"text": "IF THE READER WILL EXCUSE ME I WILL SAY NOTHING OF MY ANTECEDENTS NOR OF THE CIRCUMSTANCES WHICH LED ME TO LEAVE MY NATIVE COUNTRY THE NARRATIVE WOULD BE TEDIOUS TO HIM AND PAINFUL TO MYSELF"
}
]
},
"type": "MonoCut",
"offset": 5.0
},
{
"cut": {
"channel": 0,
"id": "07d009a9-e7a1-4611-aa21-32ff9b43dff0",
"duration": 10.51,
"recording": {
"duration": 10.51,
"id": "2412-153948-0001",
"num_samples": 168160,
"sampling_rate": 16000,
"sources": [
{
"channels": [
0
],
"source": "test/fixtures/mix_cut_test/audio/storage/2412-153948-0001.flac",
"type": "file"
}
]
},
"start": 0,
"supervisions": [
{
"channel": 0,
"duration": 4.65,
"id": "174-168635-0001",
"language": "English",
"recording_id": "174-168635-0001",
"speaker": "174",
"start": 0.0,
"text": "THE HEART OF THAT EX CONVICT WAS FULL OF VIRGINITY"
}
]
},
"type": "MonoCut",
"offset": 5.02,
"snr": 20.0
}
],
"type": "MixedCut"
}
]

0 comments on commit d430776

Please sign in to comment.