Skip to content

Fix bugs in MixedCut logic #1073

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion lhotse/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -1308,6 +1308,7 @@ def __init__(
base_audio: np.ndarray,
sampling_rate: int,
reference_energy: Optional[float] = None,
base_offset: Seconds = 0.0,
):
"""
AudioMixer's constructor.
Expand All @@ -1317,9 +1318,10 @@ def __init__(
:param sampling_rate: Sampling rate of the audio.
:param reference_energy: Optionally pass a reference energy value to compute SNRs against.
This might be required when ``base_audio`` corresponds to zero-padding.
:param base_offset: Optionally pass a time offset for the base signal.
"""
self.tracks = [base_audio]
self.offsets = [0]
self.offsets = [compute_num_samples(base_offset, sampling_rate)]
self.sampling_rate = sampling_rate
self.num_channels = base_audio.shape[0]
self.dtype = self.tracks[0].dtype
Expand Down
12 changes: 12 additions & 0 deletions lhotse/cut/mixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,17 @@ def truncate(
snr=track.snr,
)
)

# Edge case: no tracks left after truncation. This can happen if we truncated an offset region.
# In this case, return a PaddingCut of the requested duration
if len(new_tracks) == 0:
return PaddingCut(
id=self.id if preserve_id else str(uuid4()),
duration=duration,
sampling_rate=self.sampling_rate,
feat_value=0.0,
)

if len(new_tracks) == 1:
# The truncation resulted in just a single cut - simply return it.
return new_tracks[0].cut
Expand Down Expand Up @@ -1047,6 +1058,7 @@ def load_audio(
self.tracks[0].cut.load_audio(),
sampling_rate=self.tracks[0].cut.sampling_rate,
reference_energy=reference_energy,
base_offset=self.tracks[0].offset,
)

for pos, track in enumerate(self.tracks[1:], start=1):
Expand Down
15 changes: 15 additions & 0 deletions test/cut/test_cut_mixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,16 @@ def mixed_audio_cut() -> MixedCut:
return mixed_cut


@pytest.fixture
def offseted_mixed_audio_cut() -> MixedCut:
    """Load the ``mixed-cut-id`` cut from the offseted audio manifest.

    The manifest describes a two-track mix in which every track, including
    the first one, carries a non-zero ``offset``. The fixture sanity-checks
    the total duration before handing the cut to the test.
    """
    manifest_path = "test/fixtures/mix_cut_test/offseted_audio_cut_manifest.json"
    cuts = CutSet.from_json(manifest_path)
    cut = cuts["mixed-cut-id"]
    # Guard against the fixture file drifting away from what the tests expect.
    assert isclose(cut.duration, 16.66)
    return cut


def test_mixed_cut_load_audio_mixed(mixed_audio_cut):
audio = mixed_audio_cut.load_audio()
assert audio.shape == (1, 230400)
Expand All @@ -193,6 +203,11 @@ def test_mixed_cut_load_audio_unmixed(mixed_audio_cut):
assert audio[1].shape == (1, 230400)


def test_mixed_cut_load_offseted_mixed(offseted_mixed_audio_cut):
    """Loading a mix whose first track has a non-zero offset keeps the full length."""
    samples = offseted_mixed_audio_cut.load_audio()
    # 266560 samples matches the fixture's 16.66 s duration at the manifest's
    # 16 kHz sampling rate; a single (mixed-down) channel is expected.
    expected_shape = (1, 266560)
    assert samples.shape == expected_shape


@pytest.mark.parametrize(
"mixed, mono_downmix",
[
Expand Down
21 changes: 20 additions & 1 deletion test/cut/test_cut_truncate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

from lhotse import RecordingSet
from lhotse.cut import CutSet, MixedCut, MixTrack, MonoCut
from lhotse.cut import CutSet, MixedCut, MixTrack, MonoCut, PaddingCut
from lhotse.features import Features
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.testing.dummies import DummyManifest, dummy_cut, dummy_recording
Expand Down Expand Up @@ -139,6 +139,17 @@ def simple_mixed_cut():
)


@pytest.fixture
def gapped_mixed_cut():
    """Build a two-track MixedCut whose second track is placed at ``offset=15.0``.

    Both tracks wrap 10-second dummy cuts; the first track is created without
    an explicit offset.
    """
    leading_track = MixTrack(cut=dummy_cut(0, duration=10.0))
    trailing_track = MixTrack(cut=dummy_cut(1, duration=10.0), offset=15.0)
    return MixedCut(
        id="gapped-mixed-cut",
        tracks=[leading_track, trailing_track],
    )


def test_truncate_mixed_cut_without_args(simple_mixed_cut):
truncated_cut = simple_mixed_cut.truncate()
assert truncated_cut.duration == 15.0
Expand Down Expand Up @@ -214,6 +225,14 @@ def test_truncate_mixed_cut_with_small_offset_and_duration(simple_mixed_cut):
assert truncated_cut.duration == 13.0


def test_truncate_mixed_cut_inside_gap(gapped_mixed_cut):
    """Truncating to a window that keeps no tracks must yield a PaddingCut."""
    # The requested window (offset 11.0, duration 3.0) is expected to fall
    # between the fixture's two tracks, so the result is pure padding.
    padding = gapped_mixed_cut.truncate(offset=11.0, duration=3.0)
    assert isinstance(padding, PaddingCut)
    # The padding is re-based to start at zero and spans the requested duration.
    assert padding.start == 0.0
    assert padding.duration == 3.0
    assert padding.end == 3.0


def test_truncate_cut_set_offset_start(cut_set):
truncated_cut_set = cut_set.truncate(max_duration=5, offset_type="start")
cut1, cut2 = truncated_cut_set
Expand Down
83 changes: 83 additions & 0 deletions test/fixtures/mix_cut_test/offseted_audio_cut_manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
[
{
"id": "mixed-cut-id",
"tracks": [
{
"cut": {
"channel": 0,
"id": "ab34ea6b-9d68-4113-8386-b4c585e596f5",
"duration": 11.66,
"recording": {
"duration": 11.66,
"id": "2412-153948-0000",
"num_samples": 186560,
"sampling_rate": 16000,
"sources": [
{
"channels": [
0
],
"source": "test/fixtures/mix_cut_test/audio/storage/2412-153948-0000.flac",
"type": "file"
}
]
},
"start": 0,
"supervisions": [
{
"channel": 0,
"duration": 11.66,
"id": "2412-153948-0000",
"language": "English",
"recording_id": "2412-153948-0000",
"speaker": "2412",
"start": 0.0,
"text": "IF THE READER WILL EXCUSE ME I WILL SAY NOTHING OF MY ANTECEDENTS NOR OF THE CIRCUMSTANCES WHICH LED ME TO LEAVE MY NATIVE COUNTRY THE NARRATIVE WOULD BE TEDIOUS TO HIM AND PAINFUL TO MYSELF"
}
]
},
"type": "MonoCut",
"offset": 5.0
},
{
"cut": {
"channel": 0,
"id": "07d009a9-e7a1-4611-aa21-32ff9b43dff0",
"duration": 10.51,
"recording": {
"duration": 10.51,
"id": "2412-153948-0001",
"num_samples": 168160,
"sampling_rate": 16000,
"sources": [
{
"channels": [
0
],
"source": "test/fixtures/mix_cut_test/audio/storage/2412-153948-0001.flac",
"type": "file"
}
]
},
"start": 0,
"supervisions": [
{
"channel": 0,
"duration": 4.65,
"id": "174-168635-0001",
"language": "English",
"recording_id": "174-168635-0001",
"speaker": "174",
"start": 0.0,
"text": "THE HEART OF THAT EX CONVICT WAS FULL OF VIRGINITY"
}
]
},
"type": "MonoCut",
"offset": 5.02,
"snr": 20.0
}
],
"type": "MixedCut"
}
]