Skip to content

Commit 5c5c51c

Browse files
authored
Merge pull request #486 from LAAC-LSCP/samplers/session_id
Samplers/session
2 parents 44a80cf + 8fc3c11 commit 5c5c51c

File tree

7 files changed

+92
-31
lines changed

7 files changed

+92
-31
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file.
99
- conversations summary extraction pipeline
1010
- docs and tests for init command
1111
- docs and tests for automated-import command
12+
- option '--by' for periodic sampling along with doc and tests
1213

1314
### Fixed
1415

ChildProject/pipelines/samplers.py

+35-16
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ def __init__(
224224
period: int,
225225
offset: int = 0,
226226
profile: str = None,
227+
by: str = "recording_filename",
227228
recordings: Union[str, List[str], pd.DataFrame] = None,
228229
exclude: Union[str, pd.DataFrame] = None,
229230
):
@@ -233,6 +234,7 @@ def __init__(
233234
self.period = int(period)
234235
self.offset = int(offset)
235236
self.profile = profile
237+
self.by = by
236238

237239
def _sample(self):
238240
recordings = self.project.get_recordings_from_list(self.recordings)
@@ -250,23 +252,34 @@ def _sample(self):
250252

251253
recordings["duration"].fillna(0, inplace=True)
252254

253-
self.segments = recordings[["recording_filename", "duration"]].copy()
254-
self.segments["segment_onset"] = self.segments.apply(
255-
lambda row: np.arange(
256-
self.offset,
257-
row["duration"] - self.length + 1e-4,
255+
recordings = recordings.copy()
256+
recordings['start_ts'] = recordings.apply(
257+
lambda row: int(pd.Timestamp(str(row['date_iso']) + 'T' + str(row['start_time'])).timestamp()) * 1000,
258+
axis=1)
259+
recordings['end_ts'] = recordings['start_ts'] + recordings['duration']
260+
segments = []
261+
# work by groups (by argument), create a singular timeline from those groups and choose periodic segments from there
262+
# this means that recordings following each other will maintain continuity in sampling period
263+
# also means concurrent recordings in the same session will have the same samples kept time/date wise regardless of shifts in start
264+
for i, gdf in recordings.groupby(self.by):
265+
all_segments = pd.DataFrame({'segment_onset': np.arange(
266+
gdf['start_ts'].min() + self.offset,
267+
gdf['end_ts'].max(),
258268
self.period + self.length,
259-
),
260-
axis=1,
261-
)
262-
self.segments = self.segments.explode("segment_onset")
263-
# discard recordings that can't include segments (they are NA here bc explode keeps empty lists)
264-
self.segments = self.segments.dropna(subset=['segment_onset'])
265-
self.segments["segment_onset"] = self.segments["segment_onset"].astype(int)
266-
self.segments["segment_offset"] = self.segments["segment_onset"] + self.length
267-
self.segments.rename(
268-
columns={"recording_filename": "recording_filename"}, inplace=True
269-
)
269+
)})
270+
all_segments['segment_offset'] = all_segments['segment_onset'] + self.length
271+
rec_segments = []
272+
for recording in gdf.to_dict(orient='records'):
273+
tmp_segs = all_segments[(all_segments['segment_offset'] > recording['start_ts']) & (all_segments['segment_onset'] < recording['end_ts'])].copy()
274+
# cut down overflowing stamps
275+
tmp_segs['segment_onset'] = tmp_segs['segment_onset'].apply(lambda x: max(x, recording['start_ts']))
276+
tmp_segs['segment_offset'] = tmp_segs['segment_offset'].apply(lambda x: min(x, recording['end_ts']))
277+
tmp_segs['segment_onset'] = tmp_segs['segment_onset'] - recording['start_ts']
278+
tmp_segs['segment_offset'] = tmp_segs['segment_offset'] - recording['start_ts']
279+
tmp_segs['recording_filename'] = recording['recording_filename']
280+
rec_segments.append(tmp_segs)
281+
segments.append(pd.concat(rec_segments))
282+
self.segments = pd.concat(segments)
270283

271284
return self.segments
272285

@@ -297,6 +310,12 @@ def add_parser(subparsers, subcommand):
297310
default="",
298311
type=str,
299312
)
313+
parser.add_argument(
314+
"--by",
315+
help="units to sample from (default behavior is to sample by recording)",
316+
choices=["recording_filename", "session_id", "child_id"],
317+
default="recording_filename",
318+
)
300319

301320

302321
class RandomVocalizationSampler(Sampler):

docs/source/samplers.rst

+11-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,17 @@ All samplers have a few parameters in common:
3535
Periodic sampler
3636
~~~~~~~~~~~~~~~~
3737

38-
Draw segments from the recordings, periodically
38+
Draw segments from the recordings periodically.
39+
40+
The ``--period`` argument is the time between the end of one segment and the start of the next. For example,
41+
length:60000(1min) period:3540000(59min) will sample the first minute of every hour whereas length:60000(1min)
42+
period:3600000(1h) will sample the 1st min of the 1st hour, the 2nd min of the 2nd hour and so on.
43+
44+
The ``--by`` argument will group recordings to form a single timeline in which the periodicity defines the parts
45+
to annotate, then those parts are extracted from the recordings of the group. This means that recordings following
46+
each other will maintain continuity in sampling period if they belong to the same session and sampling is done by session_id. It also means
47+
concurrent recordings in the same session will have the same samples kept time/date wise regardless of shifts in start.
48+
The default is to sample by 'recording_filename', which will simply periodically sample each recording independently.
3949

4050
.. clidoc::
4151

Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
child_id,recording_device_type,experiment,notes,recording_filename,start_time,noisy_setting,date_iso,duration,discard
2-
1,usb,test,VERY confidential notes,sound.wav,9:00,1,2020-04-20,4000,
3-
1,usb,test,,sound2.wav,9:00,1,2020-04-21,4000,0
4-
1,usb,test,discarded audio,sound3.wav,10:00,0,2020-04-22,10000,1
1+
child_id,recording_device_type,experiment,notes,recording_filename,start_time,noisy_setting,date_iso,duration,discard,session_id
2+
1,usb,test,VERY confidential notes,sound.wav,9:00,1,2020-04-20,4000,,s1
3+
1,usb,test,,sound2.wav,9:00,1,2020-04-21,4000,0,s1
4+
1,usb,test,discarded audio,sound3.wav,10:00,0,2020-04-22,10000,1,s1

tests/test_samplers.py

+18-10
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pytest
44
import shutil
55
from functools import partial
6+
from pathlib import Path
67

78
from ChildProject.projects import ChildProject
89
from ChildProject.annotations import AnnotationManager
@@ -15,32 +16,39 @@
1516
SamplerPipeline,
1617
)
1718

19+
TRUTH = Path('tests', 'truth')
20+
PATH = Path('output', 'samplers')
1821

1922
def fake_conversation(data, filename):
2023
return data
2124

2225

2326
@pytest.fixture(scope="function")
2427
def project(request):
25-
if not os.path.exists("output/samplers"):
26-
shutil.copytree(src="examples/valid_raw_data", dst="output/samplers")
28+
if os.path.exists(PATH):
29+
# shutil.copytree(src="examples/valid_raw_data", dst="output/annotations")
30+
shutil.rmtree(PATH)
31+
shutil.copytree(src="examples/valid_raw_data", dst=PATH)
2732

28-
project = ChildProject("output/samplers")
33+
project = ChildProject(PATH)
2934
project.read()
30-
yield project
3135

36+
yield project
3237

33-
def test_periodic(project):
38+
@pytest.mark.parametrize("by,truth",
39+
[('recording_filename', TRUTH / 'sampler' / 'periodic_rec.csv'),
40+
('session_id', TRUTH / 'sampler' / 'periodic_sess.csv'),
41+
])
42+
def test_periodic(project, by, truth):
3443
sampler = PeriodicSampler(
35-
project=project, length=1000, period=1000, recordings=["sound.wav"]
44+
project=project, offset=1000, length=500, period=200, recordings=["sound.wav",'sound2.wav'], by=by
3645
)
3746
sampler.sample()
3847

39-
duration = project.recordings[
40-
project.recordings["recording_filename"] == "sound.wav"
41-
]["duration"].iloc[0]
48+
# sampler.segments.to_csv(truth, index=False)
49+
truth = pd.read_csv(truth)
4250

43-
assert len(sampler.segments) == int(duration / (1000 + 1000))
51+
pd.testing.assert_frame_equal(sampler.segments.reset_index(drop=True), truth, check_like=True)
4452

4553

4654
def test_energy_detection(project):

tests/truth/sampler/periodic_rec.csv

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
segment_onset,segment_offset,recording_filename
2+
1000,1500,sound.wav
3+
1700,2200,sound.wav
4+
2400,2900,sound.wav
5+
3100,3600,sound.wav
6+
3800,4000,sound.wav
7+
1000,1500,sound2.wav
8+
1700,2200,sound2.wav
9+
2400,2900,sound2.wav
10+
3100,3600,sound2.wav
11+
3800,4000,sound2.wav

tests/truth/sampler/periodic_sess.csv

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
segment_onset,segment_offset,recording_filename
2+
1000,1500,sound.wav
3+
1700,2200,sound.wav
4+
2400,2900,sound.wav
5+
3100,3600,sound.wav
6+
3800,4000,sound.wav
7+
0,400,sound2.wav
8+
600,1100,sound2.wav
9+
1300,1800,sound2.wav
10+
2000,2500,sound2.wav
11+
2700,3200,sound2.wav
12+
3400,3900,sound2.wav

0 commit comments

Comments
 (0)